diff --git a/GPy/__init__.py b/GPy/__init__.py index 5e091170..26713406 100644 --- a/GPy/__init__.py +++ b/GPy/__init__.py @@ -3,23 +3,23 @@ import warnings warnings.filterwarnings("ignore", category=DeprecationWarning) -import core -from core.parameterization import transformations, priors +from . import core +from .core.parameterization import transformations, priors constraints = transformations -import models -import mappings -import inference -import util -import examples -import likelihoods -import testing +from . import models +from . import mappings +from . import inference +from . import util +from . import examples +from . import likelihoods +from . import testing from numpy.testing import Tester -import kern -import plotting +from . import kern +from . import plotting # Direct imports for convenience: -from core import Model -from core.parameterization import Param, Parameterized, ObsAr +from .core import Model +from .core.parameterization import Param, Parameterized, ObsAr #@nottest try: diff --git a/GPy/core/__init__.py b/GPy/core/__init__.py index ebed29bb..142eccbf 100644 --- a/GPy/core/__init__.py +++ b/GPy/core/__init__.py @@ -1,12 +1,12 @@ # Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt). # Licensed under the BSD 3-clause license (see LICENSE.txt) -from model import * -from parameterization.parameterized import adjust_name_for_printing, Parameterizable -from parameterization.param import Param, ParamConcatenation -from parameterization.observable_array import ObsAr +from .model import * +from .parameterization.parameterized import adjust_name_for_printing, Parameterizable +from .parameterization.param import Param, ParamConcatenation +from .parameterization.observable_array import ObsAr -from gp import GP -from svgp import SVGP -from sparse_gp import SparseGP -from mapping import * +from .gp import GP +from .svgp import SVGP +from .sparse_gp import SparseGP +from .mapping import * diff --git a/GPy/core/gp.py b/GPy/core/gp.py index 05ce282c..bbd3939b 100644 --- a/GPy/core/gp.py +++ b/GPy/core/gp.py @@ -4,12 +4,12 @@ import numpy as np import sys from .. import kern -from model import Model -from mapping import Mapping -from parameterization import ObsAr +from .model import Model +from .mapping import Mapping +from .parameterization import ObsAr from .. 
import likelihoods from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation -from parameterization.variational import VariationalPosterior +from .parameterization.variational import VariationalPosterior import logging from GPy.util.normalizer import MeanNorm @@ -92,7 +94,7 @@ class GP(Model): inference_method = exact_gaussian_inference.ExactGaussianInference() else: inference_method = expectation_propagation.EP() - print "defaulting to ", inference_method, "for latent function inference" + print("defaulting to ", inference_method, "for latent function inference") self.inference_method = inference_method logger.info("adding kernel and likelihood as parameters") @@ -459,7 +461,7 @@ class GP(Model): try: super(GP, self).optimize(optimizer, start, **kwargs) except KeyboardInterrupt: - print "KeyboardInterrupt caught, calling on_optimization_end() to round things up" + print("KeyboardInterrupt caught, calling on_optimization_end() to round things up") self.inference_method.on_optimization_end() raise diff --git a/GPy/core/mapping.py b/GPy/core/mapping.py index dd45a26e..30614384 100644 --- a/GPy/core/mapping.py +++ b/GPy/core/mapping.py @@ -3,7 +3,7 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) import sys -from parameterization import Parameterized +from .parameterization import Parameterized import numpy as np class Mapping(Parameterized): diff --git a/GPy/core/model.py b/GPy/core/model.py index 0251d58c..4108e72c 100644 --- a/GPy/core/model.py +++ b/GPy/core/model.py @@ -5,7 +5,7 @@ from .. import likelihoods from ..inference import optimization from ..util.misc import opt_wrapper -from parameterization import Parameterized +from .parameterization import Parameterized import multiprocessing as mp import numpy as np from numpy.linalg.linalg import LinAlgError @@ -13,6 +13,7 @@ import itertools import sys from .verbose_optimization import VerboseOptimization # import numdifftools as ndt +from functools import reduce class Model(Parameterized): _fail_count = 0 # Count of failed optimization steps (see objective) @@ -30,7 +31,7 @@ class Model(Parameterized): self.add_observer(self.tie, self.tie._parameters_changed_notification, priority=-500) def log_likelihood(self): - raise NotImplementedError, "this needs to be implemented to use the model class" + raise NotImplementedError("this needs to be implemented to use the model class") def _log_likelihood_gradients(self): return self.gradient.copy() @@ -82,7 +83,7 @@ class Model(Parameterized): pool.close() # signal that no more data coming in pool.join() # wait for all the tasks to complete except KeyboardInterrupt: - print "Ctrl+c received, terminating and joining pool." + print("Ctrl+c received, terminating and joining pool.") pool.terminate() pool.join() @@ -95,10 +96,10 @@ class Model(Parameterized): self.optimization_runs.append(jobs[i].get()) if verbose: - print("Optimization restart {0}/{1}, f = {2}".format(i + 1, num_restarts, self.optimization_runs[-1].f_opt)) + print(("Optimization restart {0}/{1}, f = {2}".format(i + 1, num_restarts, self.optimization_runs[-1].f_opt))) except Exception as e: if robust: - print("Warning - optimization restart {0}/{1} failed".format(i + 1, num_restarts)) + print(("Warning - optimization restart {0}/{1} failed".format(i + 1, num_restarts))) else: raise e @@ -119,7 +120,7 @@ class Model(Parameterized): DEPRECATED. 
""" - raise DeprecationWarning, 'parameters now have default constraints' + raise DeprecationWarning('parameters now have default constraints') def objective_function(self): """ @@ -237,10 +238,10 @@ class Model(Parameterized): """ if self.is_fixed or self.size == 0: - print 'nothing to optimize' + print('nothing to optimize') if not self.update_model(): - print "updates were off, setting updates on again" + print("updates were off, setting updates on again") self.update_model(True) if start == None: @@ -305,7 +306,7 @@ class Model(Parameterized): transformed_index = (indices - (~self._fixes_).cumsum())[transformed_index[which[0]]] if transformed_index.size == 0: - print "No free parameters to check" + print("No free parameters to check") return # just check the global ratio @@ -340,9 +341,9 @@ class Model(Parameterized): cols.extend([max(float_len, len(header[i])) for i in range(1, len(header))]) cols = np.array(cols) + 5 header_string = ["{h:^{col}}".format(h=header[i], col=cols[i]) for i in range(len(cols))] - header_string = map(lambda x: '|'.join(x), [header_string]) + header_string = list(map(lambda x: '|'.join(x), [header_string])) separator = '-' * len(header_string[0]) - print '\n'.join([header_string[0], separator]) + print('\n'.join([header_string[0], separator])) if target_param is None: param_index = range(len(x)) transformed_index = param_index @@ -358,13 +359,13 @@ class Model(Parameterized): transformed_index = param_index if param_index.size == 0: - print "No free parameters to check" + print("No free parameters to check") return gradient = self._grads(x).copy() np.where(gradient == 0, 1e-312, gradient) ret = True - for nind, xind in itertools.izip(param_index, transformed_index): + for nind, xind in zip(param_index, transformed_index): xx = x.copy() xx[xind] += step f1 = self._objective(xx) @@ -392,7 +393,7 @@ class Model(Parameterized): ng = '%.6f' % float(numerical_gradient) df = '%1.e' % float(df_ratio) grad_string = "{0:<{c0}}|{1:^{c1}}|{2:^{c2}}|{3:^{c3}}|{4:^{c4}}|{5:^{c5}}".format(formatted_name, r, d, g, ng, df, c0=cols[0] + 9, c1=cols[1], c2=cols[2], c3=cols[3], c4=cols[4], c5=cols[5]) - print grad_string + print(grad_string) self.optimizer_array = x return ret diff --git a/GPy/core/parameterization/__init__.py b/GPy/core/parameterization/__init__.py index 8e9aa094..de736671 100644 --- a/GPy/core/parameterization/__init__.py +++ b/GPy/core/parameterization/__init__.py @@ -1,5 +1,5 @@ # Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
# Licensed under the BSD 3-clause license (see LICENSE.txt) -from param import Param, ObsAr -from parameterized import Parameterized +from .param import Param, ObsAr +from .parameterized import Parameterized diff --git a/GPy/core/parameterization/index_operations.py b/GPy/core/parameterization/index_operations.py index 61c82da1..e4803f37 100644 --- a/GPy/core/parameterization/index_operations.py +++ b/GPy/core/parameterization/index_operations.py @@ -3,7 +3,8 @@ import numpy from numpy.lib.function_base import vectorize -from lists_and_dicts import IntArrayDict +from .lists_and_dicts import IntArrayDict +from functools import reduce def extract_properties_to_index(index, props): prop_index = dict() @@ -62,12 +63,15 @@ class ParameterIndexOperations(object): def __init__(self, constraints=None): self._properties = IntArrayDict() if constraints is not None: - for t, i in constraints.iteritems(): + #python 3 fix + #for t, i in constraints.iteritems(): + for t, i in constraints.items(): self.add(t, i) - def iteritems(self): - return self._properties.iteritems() - + #iteritems has gone in python 3 + #def iteritems(self): + # return self._properties.iteritems() + def items(self): return self._properties.items() @@ -75,7 +79,7 @@ return self._properties.keys() def iterproperties(self): - return self._properties.iterkeys() + return iter(self._properties) def shift_right(self, start, size): for ind in self.iterindices(): @@ -83,7 +87,7 @@ ind[toshift] += size def shift_left(self, start, size): - for v, ind in self.items(): + for v, ind in list(self.items()): todelete = (ind>=start) * (ind<start+size) […] def _filter_index(self, ind): return ind[(ind >= self._offset) * (ind < (self._offset + self._size))] - self._offset - - def iteritems(self): - for i, ind in self._param_index_ops.iteritems(): + #iteritems has gone in python 3. 
It has been renamed items() + def items(self): + _items_list = list(self._param_index_ops.items()) + for i, ind in _items_list: ind2 = self._filter_index(ind) if ind2.size > 0: yield i, ind2 - - def items(self): - return [[i,v] for i,v in self.iteritems()] + + #Python 3 items() is now implemented as per py2 iteritems + #def items(self): + # return [[i,v] for i,v in self.iteritems()] def properties(self): return [i for i in self.iterproperties()] def iterproperties(self): - for i, _ in self.iteritems(): + #py3 fix + #for i, _ in self.iteritems(): + for i, _ in self.items(): yield i @@ -230,7 +246,9 @@ class ParameterIndexOperationsView(object): def iterindices(self): - for _, ind in self.iteritems(): + #py3 fix + #for _, ind in self.iteritems(): + for _, ind in self.items(): yield ind @@ -286,10 +304,14 @@ class ParameterIndexOperationsView(object): def __str__(self, *args, **kwargs): import pprint - return pprint.pformat(dict(self.iteritems())) + #py3 fixes + #return pprint.pformat(dict(self.iteritems())) + return pprint.pformat(dict(self.items())) def update(self, parameter_index_view, offset=0): - for i, v in parameter_index_view.iteritems(): + #py3 fixes + #for i, v in parameter_index_view.iteritems(): + for i, v in parameter_index_view.items(): self.add(i, v+offset) @@ -297,6 +319,8 @@ class ParameterIndexOperationsView(object): return self.__deepcopy__(None) def __deepcopy__(self, memo): - return ParameterIndexOperations(dict(self.iteritems())) + #py3 fix + #return ParameterIndexOperations(dict(self.iteritems())) + return ParameterIndexOperations(dict(self.items())) pass diff --git a/GPy/core/parameterization/lists_and_dicts.py b/GPy/core/parameterization/lists_and_dicts.py index 5afbb8ed..2d774a76 100644 --- a/GPy/core/parameterization/lists_and_dicts.py +++ b/GPy/core/parameterization/lists_and_dicts.py @@ -32,7 +32,7 @@ class ArrayList(list): if el is item: return index index += 1 - raise ValueError, "{} is not in list".format(item) + raise ValueError("{} is not in list".format(item)) pass class ObserverList(object): @@ -75,7 +75,7 @@ class ObserverList(object): def __str__(self): from . 
import ObsAr, Param - from parameter_core import Parameterizable + from .parameter_core import Parameterizable ret = [] curr_p = None diff --git a/GPy/core/parameterization/observable.py b/GPy/core/parameterization/observable.py index 8a85c6ca..0836b5d6 100644 --- a/GPy/core/parameterization/observable.py +++ b/GPy/core/parameterization/observable.py @@ -12,7 +12,7 @@ class Observable(object): """ def __init__(self, *args, **kwargs): super(Observable, self).__init__() - from lists_and_dicts import ObserverList + from .lists_and_dicts import ObserverList self.observers = ObserverList() self._update_on = True diff --git a/GPy/core/parameterization/observable_array.py b/GPy/core/parameterization/observable_array.py index 271fe7b9..c6fea497 100644 --- a/GPy/core/parameterization/observable_array.py +++ b/GPy/core/parameterization/observable_array.py @@ -3,8 +3,8 @@ import numpy as np -from parameter_core import Pickleable -from observable import Observable +from .parameter_core import Pickleable +from .observable import Observable class ObsAr(np.ndarray, Pickleable, Observable): """ @@ -39,7 +39,7 @@ class ObsAr(np.ndarray, Pickleable, Observable): return self.view(np.ndarray) def copy(self): - from lists_and_dicts import ObserverList + from .lists_and_dicts import ObserverList memo = {} memo[id(self)] = self memo[id(self.observers)] = ObserverList() diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index 1246bc18..1838f2bf 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -4,8 +4,9 @@ import itertools import numpy np = numpy -from parameter_core import Parameterizable, adjust_name_for_printing, Pickleable -from observable_array import ObsAr +from .parameter_core import Parameterizable, adjust_name_for_printing, Pickleable +from .observable_array import ObsAr +from functools import reduce ###### printing __constraints_name__ = "Constraint" @@ -156,7 +157,7 @@ class Param(Parameterizable, ObsAr): #=========================================================================== @property def is_fixed(self): - from transformations import __fixed__ + from .transformations import __fixed__ return self.constraints[__fixed__].size == self.size def _get_original(self, param): @@ -207,10 +208,14 @@ class Param(Parameterizable, ObsAr): return 0 @property def _constraints_str(self): - return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.constraints.iteritems()))] + #py3 fix + #return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.constraints.iteritems()))] + return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.constraints.items()))] @property def _priors_str(self): - return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.priors.iteritems()))] + #py3 fix + #return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.priors.iteritems()))] + return [' '.join(map(lambda c: str(c[0]) if c[1].size == self._realsize_ else "{" + str(c[0]) + "}", self.priors.items()))] @property def _ties_str(self): return [''] @@ -279,7 +284,7 @@ class Param(Parameterizable, ObsAr): .tg th{font-family:"Courier New", Courier, monospace !important;font-weight:normal;color:#fff;background-color:#26ADE4;border-style:solid;border-width:1px;overflow:hidden;word-break:normal;border-color:#DCDCDC;} .tg 
.tg-left{font-family:"Courier New", Courier, monospace !important;font-weight:normal;text-align:left;} .tg .tg-right{font-family:"Courier New", Courier, monospace !important;font-weight:normal;text-align:right;} -</style>"""] + ['<table class="tg">'] + [header] + ["<tr><td class=tg-left>{i}</td><td class=tg-right>{x}</td><td class=tg-left>{c}</td><td class=tg-left>{p}</td><td class=tg-left>{t}</td></tr>".format(x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in itertools.izip(indices, vals, constr_matrix, ties, prirs)] + ["</table>"]) +</style>"""] + ['<table class="tg">'] + [header] + ["<tr><td class=tg-left>{i}</td><td class=tg-right>{x}</td><td class=tg-left>{c}</td><td class=tg-left>{p}</td><td class=tg-left>{t}</td></tr>".format(x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in zip(indices, vals, constr_matrix, ties, prirs)] + ["</table>"]) def __str__(self, constr_matrix=None, indices=None, prirs=None, ties=None, lc=None, lx=None, li=None, lp=None, lt=None, only_name=False): filter_ = self._current_slice_ @@ -300,7 +305,7 @@ if only_name: header = header_format.format(lc, lx, li, lt, lp, ' ', x=self.hierarchy_name(), c=sep*lc, i=sep*li, t=sep*lt, p=sep*lp) # nice header for printing else: header = header_format.format(lc, lx, li, lt, lp, ' ', x=self.hierarchy_name(), c=__constraints_name__, i=__index_name__, t=__tie_name__, p=__priors_name__) # nice header for printing if not ties: ties = itertools.cycle(['']) - return "\n".join([header] + [" {i!s:^{3}s} | {x: >{1}.{2}g} | {c:^{0}s} | {p:^{5}s} | {t:^{4}s} ".format(lc, lx, __precision__, li, lt, lp, x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in itertools.izip(indices, vals, constr_matrix, ties, prirs)]) # return all the constraints with right indices + return "\n".join([header] + [" {i!s:^{3}s} | {x: >{1}.{2}g} | {c:^{0}s} | {p:^{5}s} | {t:^{4}s} ".format(lc, lx, __precision__, li, lt, lp, x=x, c=" ".join(map(str, c)), p=" ".join(map(str, p)), t=(t or ''), i=i) for i, x, c, t, p in zip(indices, vals, constr_matrix, ties, prirs)]) # return all the constraints with right indices # except: return super(Param, self).__str__() class ParamConcatenation(object): @@ -313,7 +318,7 @@ See :py:class:`GPy.core.parameter.Param` for more details on constraining. """ # self.params = params - from lists_and_dicts import ArrayList + from .lists_and_dicts import ArrayList self.params = ArrayList([]) for p in params: for p in p.flattened_parameters: @@ -336,7 +341,9 @@ level += 1 parent = parent._parent_ import operator - self.parents = map(lambda x: x[0], sorted(parents.iteritems(), key=operator.itemgetter(1))) + #py3 fix + #self.parents = map(lambda x: x[0], sorted(parents.iteritems(), key=operator.itemgetter(1))) + self.parents = map(lambda x: x[0], sorted(parents.items(), key=operator.itemgetter(1))) #=========================================================================== # Get/set items, enable broadcasting #=========================================================================== @@ -429,14 +436,14 @@ params = self.params constr_matrices, ties_matrices, prior_matrices = zip(*map(f, params)) indices = [p._indices() for p in params] - lc = max([p._max_len_names(cm, __constraints_name__) for p, cm in itertools.izip(params, constr_matrices)]) + lc = max([p._max_len_names(cm, __constraints_name__) for p, cm in zip(params, constr_matrices)]) lx = max([p._max_len_values() for p in params]) - li = max([p._max_len_index(i) for p, i in itertools.izip(params, indices)]) - lt = max([p._max_len_names(tm, __tie_name__) for p, tm in itertools.izip(params, ties_matrices)]) - lp = max([p._max_len_names(pm, __constraints_name__) for p, pm in itertools.izip(params, prior_matrices)]) + li = max([p._max_len_index(i) for p, i in zip(params, indices)]) + lt = max([p._max_len_names(tm, __tie_name__) for p, tm in zip(params, ties_matrices)]) + lp = max([p._max_len_names(pm, __constraints_name__) for p, pm in zip(params, prior_matrices)]) strings = [] start = True - for p, cm, i, tm, pm in itertools.izip(params,constr_matrices,indices,ties_matrices,prior_matrices): + for p, cm, i, tm, pm in zip(params,constr_matrices,indices,ties_matrices,prior_matrices): strings.append(p.__str__(constr_matrix=cm, indices=i, prirs=pm, 
ties=tm, lc=lc, lx=lx, li=li, lp=lp, lt=lt, only_name=(1-start))) start = False return "\n".join(strings) diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index dc083a98..1bc6a29e 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -13,11 +13,12 @@ Observable Pattern for patameterization """ -from transformations import Transformation,Logexp, NegativeLogexp, Logistic, __fixed__, FIXED, UNFIXED +from .transformations import Transformation,Logexp, NegativeLogexp, Logistic, __fixed__, FIXED, UNFIXED import numpy as np import re import logging -from updateable import Updateable +from .updateable import Updateable +from functools import reduce class HierarchyError(Exception): """ @@ -36,7 +37,7 @@ def adjust_name_for_printing(name): name = name.replace("/", "_l_").replace("@", '_at_') name = name.replace("(", "_of_").replace(")", "") if re.match(r'^[a-zA-Z_][a-zA-Z0-9-_]*$', name) is None: - raise NameError, "name {} converted to {} cannot be further converted to valid python variable name!".format(name2, name) + raise NameError("name {} converted to {} cannot be further converted to valid python variable name!".format(name2, name)) return name return '' @@ -65,13 +66,13 @@ class Parentable(object): Gets called, when the parent changed, so we can adjust our inner attributes according to the new parent. """ - raise NotImplementedError, "shouldnt happen, Parentable objects need to be able to change their parent" + raise NotImplementedError("shouldnt happen, Parentable objects need to be able to change their parent") def _disconnect_parent(self, *args, **kw): """ Disconnect this object from its parent """ - raise NotImplementedError, "Abstract superclass" + raise NotImplementedError("Abstract superclass") @property def _highest_parent_(self): @@ -109,7 +110,10 @@ class Pickleable(object): it properly. :param protocol: pickling protocol to use, python-pickle for details. 
""" - import cPickle as pickle + try: #Py2 + import cPickle as pickle + except ImportError: #Py3 + import pickle if isinstance(f, str): with open(f, 'wb') as f: pickle.dump(self, f, protocol) @@ -138,9 +142,9 @@ class Pickleable(object): which = self which.traverse_parents(parents.append) # collect parents for p in parents: - if not memo.has_key(id(p)):memo[id(p)] = None # set all parents to be None, so they will not be copied - if not memo.has_key(id(self.gradient)):memo[id(self.gradient)] = None # reset the gradient - if not memo.has_key(id(self._fixes_)):memo[id(self._fixes_)] = None # fixes have to be reset, as this is now highest parent + if not id(p) in memo :memo[id(p)] = None # set all parents to be None, so they will not be copied + if not id(self.gradient) in memo:memo[id(self.gradient)] = None # reset the gradient + if not id(self._fixes_) in memo :memo[id(self._fixes_)] = None # fixes have to be reset, as this is now highest parent copy = copy.deepcopy(self, memo) # and start the copy copy._parent_index_ = None copy._trigger_params_changed() @@ -163,14 +167,16 @@ class Pickleable(object): '_Cacher_wrap__cachers', # never pickle cachers ] dc = dict() - for k,v in self.__dict__.iteritems(): + #py3 fix + #for k,v in self.__dict__.iteritems(): + for k,v in self.__dict__.items(): if k not in ignore_list: dc[k] = v return dc def __setstate__(self, state): self.__dict__.update(state) - from lists_and_dicts import ObserverList + from .lists_and_dicts import ObserverList self.observers = ObserverList() self._setup_observers() self._optimizer_copy_transformed = False @@ -214,7 +220,7 @@ class Gradcheckable(Pickleable, Parentable): Perform the checkgrad on the model. TODO: this can be done more efficiently, when doing it inside here """ - raise HierarchyError, "This parameter is not in a model with a likelihood, and, therefore, cannot be gradient checked!" + raise HierarchyError("This parameter is not in a model with a likelihood, and, therefore, cannot be gradient checked!") class Nameable(Gradcheckable): """ @@ -268,7 +274,7 @@ class Indexable(Nameable, Updateable): def __init__(self, name, default_constraint=None, *a, **kw): super(Indexable, self).__init__(name=name, *a, **kw) self._default_constraint_ = default_constraint - from index_operations import ParameterIndexOperations + from .index_operations import ParameterIndexOperations self.constraints = ParameterIndexOperations() self.priors = ParameterIndexOperations() if self._default_constraint_ is not None: @@ -310,7 +316,7 @@ class Indexable(Nameable, Updateable): that is an int array, containing the indexes for the flattened param inside this parameterized logic. 
""" - from param import ParamConcatenation + from .param import ParamConcatenation if isinstance(param, ParamConcatenation): return np.hstack((self._raveled_index_for(p) for p in param.params)) return param._raveled_index() + self._offset_for(param) @@ -407,7 +413,7 @@ class Indexable(Nameable, Updateable): repriorized = self.unset_priors() self._add_to_index_operations(self.priors, repriorized, prior, warning) - from domains import _REAL, _POSITIVE, _NEGATIVE + from .domains import _REAL, _POSITIVE, _NEGATIVE if prior.domain is _POSITIVE: self.constrain_positive(warning) elif prior.domain is _NEGATIVE: @@ -426,7 +432,9 @@ class Indexable(Nameable, Updateable): """evaluate the prior""" if self.priors.size > 0: x = self.param_array - return reduce(lambda a, b: a + b, (p.lnpdf(x[ind]).sum() for p, ind in self.priors.iteritems()), 0) + #py3 fix + #return reduce(lambda a, b: a + b, (p.lnpdf(x[ind]).sum() for p, ind in self.priors.iteritems()), 0) + return reduce(lambda a, b: a + b, (p.lnpdf(x[ind]).sum() for p, ind in self.priors.items()), 0) return 0. def _log_prior_gradients(self): @@ -434,7 +442,9 @@ class Indexable(Nameable, Updateable): if self.priors.size > 0: x = self.param_array ret = np.zeros(x.size) - [np.put(ret, ind, p.lnpdf_grad(x[ind])) for p, ind in self.priors.iteritems()] + #py3 fix + #[np.put(ret, ind, p.lnpdf_grad(x[ind])) for p, ind in self.priors.iteritems()] + [np.put(ret, ind, p.lnpdf_grad(x[ind])) for p, ind in self.priors.items()] return ret return 0. @@ -536,7 +546,7 @@ class Indexable(Nameable, Updateable): update the constraints and priors view, so that constraining is automized for the parent. """ - from index_operations import ParameterIndexOperationsView + from .index_operations import ParameterIndexOperationsView #if getattr(self, "_in_init_"): #import ipdb;ipdb.set_trace() #self.constraints.update(param.constraints, start) @@ -558,7 +568,7 @@ class Indexable(Nameable, Updateable): """ if warning and reconstrained.size > 0: # TODO: figure out which parameters have changed and only print those - print "WARNING: reconstraining parameters {}".format(self.hierarchy_name() or self.name) + print("WARNING: reconstraining parameters {}".format(self.hierarchy_name() or self.name)) index = self._raveled_index() which.add(what, index) return index @@ -571,7 +581,7 @@ class Indexable(Nameable, Updateable): if len(transforms) == 0: transforms = which.properties() removed = np.empty((0,), dtype=int) - for t in transforms: + for t in list(transforms): unconstrained = which.remove(t, self._raveled_index()) removed = np.union1d(removed, unconstrained) if t is __fixed__: @@ -612,7 +622,9 @@ class OptimizationHandlable(Indexable): if not self._optimizer_copy_transformed: self._optimizer_copy_.flat = self.param_array.flat - [np.put(self._optimizer_copy_, ind, c.finv(self.param_array[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__] + #py3 fix + #[np.put(self._optimizer_copy_, ind, c.finv(self.param_array[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__] + [np.put(self._optimizer_copy_, ind, c.finv(self.param_array[ind])) for c, ind in self.constraints.items() if c != __fixed__] if self.has_parent() and (self.constraints[__fixed__].size != 0 or self._has_ties()): fixes = np.ones(self.size).astype(bool) fixes[self.constraints[__fixed__]] = FIXED @@ -641,21 +653,25 @@ class OptimizationHandlable(Indexable): if f is None: self.param_array.flat = p [np.put(self.param_array, ind, c.f(self.param_array.flat[ind])) - for c, ind in 
self.constraints.iteritems() if c != __fixed__] + #py3 fix + #for c, ind in self.constraints.iteritems() if c != __fixed__] + for c, ind in self.constraints.items() if c != __fixed__] else: self.param_array.flat[f] = p [np.put(self.param_array, ind[f[ind]], c.f(self.param_array.flat[ind[f[ind]]])) - for c, ind in self.constraints.iteritems() if c != __fixed__] + #py3 fix + #for c, ind in self.constraints.iteritems() if c != __fixed__] + for c, ind in self.constraints.items() if c != __fixed__] #self._highest_parent_.tie.propagate_val() self._optimizer_copy_transformed = False self.trigger_update() def _get_params_transformed(self): - raise DeprecationWarning, "_get|set_params{_optimizer_copy_transformed} is deprecated, use self.optimizer array insetad!" + raise DeprecationWarning("_get|set_params{_optimizer_copy_transformed} is deprecated, use self.optimizer array instead!") # def _set_params_transformed(self, p): - raise DeprecationWarning, "_get|set_params{_optimizer_copy_transformed} is deprecated, use self.optimizer array insetad!" + raise DeprecationWarning("_get|set_params{_optimizer_copy_transformed} is deprecated, use self.optimizer array instead!") def _trigger_params_changed(self, trigger_parent=True): """ @@ -680,7 +696,9 @@ constraint to it. """ self._highest_parent_.tie.collate_gradient() - [np.put(g, i, c.gradfactor(self.param_array[i], g[i])) for c, i in self.constraints.iteritems() if c != __fixed__] + #py3 fix + #[np.put(g, i, c.gradfactor(self.param_array[i], g[i])) for c, i in self.constraints.iteritems() if c != __fixed__] + [np.put(g, i, c.gradfactor(self.param_array[i], g[i])) for c, i in self.constraints.items() if c != __fixed__] if self._has_fixes(): return g[self._fixes_] return g @@ -690,7 +708,9 @@ constraint to it. """ self._highest_parent_.tie.collate_gradient() - [np.put(g, i, c.gradfactor_non_natural(self.param_array[i], g[i])) for c, i in self.constraints.iteritems() if c != __fixed__] + #py3 fix + #[np.put(g, i, c.gradfactor_non_natural(self.param_array[i], g[i])) for c, i in self.constraints.iteritems() if c != __fixed__] + [np.put(g, i, c.gradfactor_non_natural(self.param_array[i], g[i])) for c, i in self.constraints.items() if c != __fixed__] if self._has_fixes(): return g[self._fixes_] return g @@ -701,7 +721,7 @@ Return the number of parameters of this parameter_handle. Param objects will always return 0. """ - raise NotImplemented, "Abstract, please implement in respective classes" + raise NotImplementedError("Abstract, please implement in respective classes") def parameter_names(self, add_self=False, adjust_for_printing=False, recursive=True): """ @@ -750,7 +770,9 @@ self.optimizer_array = x # makes sure all of the tied parameters get the same init (since there's only one prior object...) 
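# Aside, not part of the patch: a minimal runnable sketch of the dict-iteration
# change this file applies everywhere. Python 3 removed dict.iteritems(); items()
# now returns a view, so mutation during iteration needs an explicit list(). The
# dict below is illustrative only.
params = {'rbf_lengthscale': 1.0, 'noise_variance': 0.1}
for name, value in params.items():   # Py2: list of pairs; Py3: lazy view
    print(name, value)
for name in list(params):            # materialize the keys before mutating
    params.pop(name)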
# now draw from prior where possible x = self.param_array.copy() - [np.put(x, ind, p.rvs(ind.size)) for p, ind in self.priors.iteritems() if not p is None] + #Py3 fix + #[np.put(x, ind, p.rvs(ind.size)) for p, ind in self.priors.iteritems() if not p is None] + [np.put(x, ind, p.rvs(ind.size)) for p, ind in self.priors.items() if not p is None] unfixlist = np.ones((self.size,),dtype=np.bool) unfixlist[self.constraints[__fixed__]] = False self.param_array.flat[unfixlist] = x.view(np.ndarray).ravel()[unfixlist] diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py index 44173f58..691bf4a7 100644 --- a/GPy/core/parameterization/parameterized.py +++ b/GPy/core/parameterization/parameterized.py @@ -1,12 +1,12 @@ # Copyright (c) 2014, Max Zwiessele, James Hensman # Licensed under the BSD 3-clause license (see LICENSE.txt) - +import six # For metaclass support in Python 2 and 3 simultaneously import numpy; np = numpy import itertools from re import compile, _pattern_type -from param import ParamConcatenation -from parameter_core import HierarchyError, Parameterizable, adjust_name_for_printing +from .param import ParamConcatenation +from .parameter_core import HierarchyError, Parameterizable, adjust_name_for_printing import logging from GPy.core.parameterization.index_operations import ParameterIndexOperationsView @@ -27,6 +27,7 @@ class ParametersChangedMeta(type): self.parameters_changed() return self +@six.add_metaclass(ParametersChangedMeta) class Parameterized(Parameterizable): """ Parameterized class @@ -73,7 +74,9 @@ class Parameterized(Parameterizable): # Metaclass for parameters changed after init. # This makes sure, that parameters changed will always be called after __init__ # **Never** call parameters_changed() yourself - __metaclass__ = ParametersChangedMeta + #This is ignored in Python 3 -- you need to put the meta class in the function definition. 
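# Aside, not part of the patch: how six.add_metaclass replaces the Py2-only
# __metaclass__ attribute mentioned above. Meta and Base are made-up names; the
# decorator re-creates the class through the metaclass on both Python 2 and 3.
import six

class Meta(type):
    def __call__(cls, *args, **kw):
        instance = super(Meta, cls).__call__(*args, **kw)
        instance.touched = True      # runs after every __init__, like ParametersChangedMeta
        return instance

@six.add_metaclass(Meta)
class Base(object):
    pass

assert Base().touched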
+ #__metaclass__ = ParametersChangedMeta + #The six module is used to support both Python 2 and 3 simultaneously #=========================================================================== def __init__(self, name=None, parameters=[], *a, **kw): super(Parameterized, self).__init__(name=name, *a, **kw) @@ -131,7 +134,7 @@ class Parameterized(Parameterizable): if param.has_parent(): def visit(parent, self): if parent is self: - raise HierarchyError, "You cannot add a parameter twice into the hierarchy" + raise HierarchyError("You cannot add a parameter twice into the hierarchy") param.traverse_parents(visit, self) param._parent_.unlink_parameter(param) # make sure the size is set @@ -173,7 +176,7 @@ class Parameterized(Parameterizable): self._highest_parent_._connect_fixes() else: - raise HierarchyError, """Parameter exists already, try making a copy""" + raise HierarchyError("""Parameter exists already, try making a copy""") def link_parameters(self, *parameters): @@ -189,9 +192,9 @@ class Parameterized(Parameterizable): """ if not param in self.parameters: try: - raise RuntimeError, "{} does not belong to this object {}, remove parameters directly from their respective parents".format(param._short(), self.name) + raise RuntimeError("{} does not belong to this object {}, remove parameters directly from their respective parents".format(param._short(), self.name)) except AttributeError: - raise RuntimeError, "{} does not seem to be a parameter, remove parameters directly from their respective parents".format(str(param)) + raise RuntimeError("{} does not seem to be a parameter, remove parameters directly from their respective parents".format(str(param))) start = sum([p.size for p in self.parameters[:param._parent_index_]]) self._remove_parameter_name(param) @@ -215,9 +218,9 @@ class Parameterized(Parameterizable): self._highest_parent_._notify_parent_change() def add_parameter(self, *args, **kwargs): - raise DeprecationWarning, "add_parameter was renamed to link_parameter to avoid confusion of setting variables, use link_parameter instead" + raise DeprecationWarning("add_parameter was renamed to link_parameter to avoid confusion of setting variables, use link_parameter instead") def remove_parameter(self, *args, **kwargs): - raise DeprecationWarning, "remove_parameter was renamed to unlink_parameter to avoid confusion of setting variables, use unlink_parameter instead" + raise DeprecationWarning("remove_parameter was renamed to unlink_parameter to avoid confusion of setting variables, use unlink_parameter instead") def _connect_parameters(self, ignore_added_names=False): # connect parameterlist to this parameterized object @@ -237,7 +240,7 @@ class Parameterized(Parameterizable): self._param_slices_ = [] for i, p in enumerate(self.parameters): if not p.param_array.flags['C_CONTIGUOUS']: - raise ValueError, "This should not happen! Please write an email to the developers with the code, which reproduces this error. All parameter arrays must be C_CONTIGUOUS" + raise ValueError("This should not happen! Please write an email to the developers with the code, which reproduces this error. 
All parameter arrays must be C_CONTIGUOUS") p._parent_ = self p._parent_index_ = i @@ -268,7 +271,7 @@ class Parameterized(Parameterizable): """ if not isinstance(regexp, _pattern_type): regexp = compile(regexp) found_params = [] - for n, p in itertools.izip(self.parameter_names(False, False, True), self.flattened_parameters): + for n, p in zip(self.parameter_names(False, False, True), self.flattened_parameters): if regexp.match(n) is not None: found_params.append(p) return found_params @@ -279,7 +282,7 @@ class Parameterized(Parameterizable): else: if paramlist is None: paramlist = self.grep_param_names(name) - if len(paramlist) < 1: raise AttributeError, name + if len(paramlist) < 1: raise AttributeError(name) if len(paramlist) == 1: if isinstance(paramlist[-1], Parameterized): paramlist = paramlist[-1].flattened_parameters @@ -295,7 +298,7 @@ class Parameterized(Parameterizable): try: self.param_array[name] = value except: - raise ValueError, "Setting by slice or index only allowed with array-like" + raise ValueError("Setting by slice or index only allowed with array-like") self.trigger_update() else: try: param = self.__getitem__(name, paramlist) @@ -325,7 +328,7 @@ class Parameterized(Parameterizable): self._notify_parent_change() self.parameters_changed() except Exception as e: - print "WARNING: caught exception {!s}, trying to continue".format(e) + print("WARNING: caught exception {!s}, trying to continue".format(e)) def copy(self, memo=None): if memo is None: @@ -379,7 +382,7 @@ class Parameterized(Parameterizable): pl = max([len(str(x)) if x else 0 for x in prirs + ["Prior"]]) format_spec = "{{name:<{0}s}}{{desc:>{1}s}}{{const:^{2}s}}{{pri:^{3}s}}{{t:^{4}s}}".format(nl, sl, cl, pl, tl) to_print = [] - for n, d, c, t, p in itertools.izip(names, desc, constrs, ts, prirs): + for n, d, c, t, p in zip(names, desc, constrs, ts, prirs): to_print.append(format_spec.format(name=n, desc=d, const=c, t=t, pri=p)) sep = '-' * (nl + sl + cl + + pl + tl + 8 * 2 + 3) if header: @@ -414,7 +417,7 @@ class Parameterized(Parameterizable): pl = max([len(str(x)) if x else 0 for x in prirs + ["Prior"]]) format_spec = " \033[1m{{name:<{0}s}}\033[0;0m | {{desc:>{1}s}} | {{const:^{2}s}} | {{pri:^{3}s}} | {{t:^{4}s}}".format(nl, sl, cl, pl, tl) to_print = [] - for n, d, c, t, p in itertools.izip(names, desc, constrs, ts, prirs): + for n, d, c, t, p in zip(names, desc, constrs, ts, prirs): to_print.append(format_spec.format(name=n, desc=d, const=c, t=t, pri=p)) sep = '-' * (nl + sl + cl + + pl + tl + 8 * 2 + 3) if header: diff --git a/GPy/core/parameterization/priors.py b/GPy/core/parameterization/priors.py index 20a78691..38cb0d19 100644 --- a/GPy/core/parameterization/priors.py +++ b/GPy/core/parameterization/priors.py @@ -5,7 +5,7 @@ import numpy as np from scipy.special import gammaln, digamma from ...util.linalg import pdinv -from domains import _REAL, _POSITIVE +from .domains import _REAL, _POSITIVE import warnings import weakref @@ -15,8 +15,12 @@ class Prior(object): _instance = None def __new__(cls, *args, **kwargs): if not cls._instance or cls._instance.__class__ is not cls: - cls._instance = super(Prior, cls).__new__(cls, *args, **kwargs) - return cls._instance + newfunc = super(Prior, cls).__new__ + if newfunc is object.__new__: + cls._instance = newfunc(cls) + else: + cls._instance = newfunc(cls, *args, **kwargs) + return cls._instance def pdf(self, x): return np.exp(self.lnpdf(x)) @@ -52,7 +56,11 @@ class Gaussian(Prior): for instance in cls._instances: if instance().mu == mu and 
instance().sigma == sigma: return instance() - o = super(Prior, cls).__new__(cls, mu, sigma) + newfunc = super(Prior, cls).__new__ + if newfunc is object.__new__: + o = newfunc(cls) + else: + o = newfunc(cls, mu, sigma) cls._instances.append(weakref.ref(o)) return cls._instances[-1]() @@ -140,7 +148,11 @@ class LogGaussian(Gaussian): for instance in cls._instances: if instance().mu == mu and instance().sigma == sigma: return instance() - o = super(Prior, cls).__new__(cls, mu, sigma) + newfunc = super(Prior, cls).__new__ + if newfunc is object.__new__: + o = newfunc(cls) + else: + o = newfunc(cls, mu, sigma) cls._instances.append(weakref.ref(o)) return cls._instances[-1]() @@ -258,7 +270,11 @@ class Gamma(Prior): for instance in cls._instances: if instance().a == a and instance().b == b: return instance() - o = super(Prior, cls).__new__(cls, a, b) + newfunc = super(Prior, cls).__new__ + if newfunc is object.__new__: + o = newfunc(cls) + else: + o = newfunc(cls, a, b) cls._instances.append(weakref.ref(o)) return cls._instances[-1]() @@ -398,7 +414,7 @@ class DGPLVM_KFDA(Prior): def compute_cls(self, x): cls = {} # Appending each data point to its proper class - for j in xrange(self.datanum): + for j in range(self.datanum): class_label = self.get_class_label(self.lbl[j]) if class_label not in cls: cls[class_label] = [] @@ -537,7 +553,7 @@ class DGPLVM(Prior): def compute_cls(self, x): cls = {} # Appending each data point to its proper class - for j in xrange(self.datanum): + for j in range(self.datanum): class_label = self.get_class_label(self.lbl[j]) if class_label not in cls: cls[class_label] = [] @@ -549,14 +565,14 @@ class DGPLVM(Prior): M_i = np.zeros((self.classnum, self.dim)) for i in cls: # Mean of each class - class_i = cls[i] + class_i = cls[i] M_i[i] = np.mean(class_i, axis=0) return M_i # Adding data points as tuple to the dictionary so that we can access indices def compute_indices(self, x): data_idx = {} - for j in xrange(self.datanum): + for j in range(self.datanum): class_label = self.get_class_label(self.lbl[j]) if class_label not in data_idx: data_idx[class_label] = [] @@ -575,7 +591,7 @@ class DGPLVM(Prior): else: lst_idx = [] # Here we put indices of each class in to the list called lst_idx_all - for m in xrange(len(data_idx[i])): + for m in range(len(data_idx[i])): lst_idx.append(data_idx[i][m][0]) lst_idx_all.append(lst_idx) return lst_idx_all @@ -611,7 +627,7 @@ class DGPLVM(Prior): # pdb.set_trace() # Calculating Bi B_i[i] = (M_i[i] - M_0).reshape(1, self.dim) - for k in xrange(self.datanum): + for k in range(self.datanum): for i in data_idx: N_i = float(len(data_idx[i])) if k in lst_idx_all[i]: @@ -663,7 +679,7 @@ class DGPLVM(Prior): # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1)) #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1) #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0] - Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0] + Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0] return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw)) # This function calculates derivative of the log of prior function @@ -684,7 +700,7 @@ class DGPLVM(Prior): # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1)) #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1) #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0] - Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0] + Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0] Sb_inv_N_trans = np.transpose(Sb_inv_N) Sb_inv_N_trans_minus 
= -1 * Sb_inv_N_trans Sw_trans = np.transpose(Sw) @@ -742,7 +758,7 @@ class DGPLVM_T(Prior): self.datanum = lbl.shape[0] self.x_shape = x_shape self.dim = x_shape[1] - self.vec = vec + self.vec = vec def get_class_label(self, y): @@ -756,7 +772,7 @@ class DGPLVM_T(Prior): def compute_cls(self, x): cls = {} # Appending each data point to its proper class - for j in xrange(self.datanum): + for j in range(self.datanum): class_label = self.get_class_label(self.lbl[j]) if class_label not in cls: cls[class_label] = [] @@ -768,14 +784,14 @@ class DGPLVM_T(Prior): M_i = np.zeros((self.classnum, self.dim)) for i in cls: # Mean of each class - class_i = np.multiply(cls[i],vec) + class_i = np.multiply(cls[i],vec) M_i[i] = np.mean(class_i, axis=0) return M_i # Adding data points as tuple to the dictionary so that we can access indices def compute_indices(self, x): data_idx = {} - for j in xrange(self.datanum): + for j in range(self.datanum): class_label = self.get_class_label(self.lbl[j]) if class_label not in data_idx: data_idx[class_label] = [] @@ -794,7 +810,7 @@ class DGPLVM_T(Prior): else: lst_idx = [] # Here we put indices of each class in to the list called lst_idx_all - for m in xrange(len(data_idx[i])): + for m in range(len(data_idx[i])): lst_idx.append(data_idx[i][m][0]) lst_idx_all.append(lst_idx) return lst_idx_all @@ -830,7 +846,7 @@ class DGPLVM_T(Prior): # pdb.set_trace() # Calculating Bi B_i[i] = (M_i[i] - M_0).reshape(1, self.dim) - for k in xrange(self.datanum): + for k in range(self.datanum): for i in data_idx: N_i = float(len(data_idx[i])) if k in lst_idx_all[i]: @@ -883,7 +899,7 @@ class DGPLVM_T(Prior): #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1) #print 'SB_inv: ', Sb_inv_N #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0] - Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0] + Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0] return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw)) # This function calculates derivative of the log of prior function @@ -905,7 +921,7 @@ class DGPLVM_T(Prior): #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1) #print 'SB_inv: ',Sb_inv_N #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0] - Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0] + Sb_inv_N = pdinv(Sb+np.eye(Sb.shape[0])*0.1)[0] Sb_inv_N_trans = np.transpose(Sb_inv_N) Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans Sw_trans = np.transpose(Sw) diff --git a/GPy/core/parameterization/ties_and_remappings.py b/GPy/core/parameterization/ties_and_remappings.py index a81b8d61..527bc47c 100644 --- a/GPy/core/parameterization/ties_and_remappings.py +++ b/GPy/core/parameterization/ties_and_remappings.py @@ -2,8 +2,8 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np -from parameterized import Parameterized -from param import Param +from .parameterized import Parameterized +from .param import Param class Remapping(Parameterized): def mapping(self): @@ -98,7 +98,7 @@ class Tie(Parameterized): if np.all(self.label_buf[idx]==0): # None of p has been tied before. 
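# Aside, not part of the patch: xrange() is gone in Python 3 and range() is lazy
# there, so the xrange -> range renames in these hunks are behavior-preserving
# for plain loops. A tiny illustrative check:
idx = range(5)                       # Py2: list; Py3: range object -- both iterate the same
assert [i * i for i in idx] == [0, 1, 4, 9, 16]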
tie_idx = self._expandTieParam(1) - print tie_idx + print(tie_idx) tie_id = self.label_buf.max()+1 self.label_buf[tie_idx] = tie_id else: @@ -185,18 +185,18 @@ class Tie(Parameterized): def _check_change(self): changed = False if self.tied_param is not None: - for i in xrange(self.tied_param.size): + for i in range(self.tied_param.size): b0 = self.label_buf==self.label_buf[self.buf_idx[i]] b = self._highest_parent_.param_array[b0]!=self.tied_param[i] if b.sum()==0: - print 'XXX' + print('XXX') continue elif b.sum()==1: - print '!!!' + print('!!!') val = self._highest_parent_.param_array[b0][b][0] self._highest_parent_.param_array[b0] = val else: - print '@@@' + print('@@@') self._highest_parent_.param_array[b0] = self.tied_param[i] changed = True return changed @@ -212,11 +212,11 @@ class Tie(Parameterized): if self.tied_param is not None: self.tied_param.gradient = 0. [np.put(self.tied_param.gradient, i, self._highest_parent_.gradient[self.label_buf==self.label_buf[self.buf_idx[i]]].sum()) - for i in xrange(self.tied_param.size)] + for i in range(self.tied_param.size)] def propagate_val(self): if self.tied_param is not None: - for i in xrange(self.tied_param.size): + for i in range(self.tied_param.size): self._highest_parent_.param_array[self.label_buf==self.label_buf[self.buf_idx[i]]] = self.tied_param[i] diff --git a/GPy/core/parameterization/transformations.py b/GPy/core/parameterization/transformations.py index d929b1d9..7e15cee9 100644 --- a/GPy/core/parameterization/transformations.py +++ b/GPy/core/parameterization/transformations.py @@ -3,7 +3,7 @@ import numpy as np -from domains import _POSITIVE,_NEGATIVE, _BOUNDED +from .domains import _POSITIVE,_NEGATIVE, _BOUNDED import weakref import sys @@ -72,7 +72,7 @@ class Logexp(Transformation): return np.einsum('i,i->i', df, np.where(f>_lim_val, 1., 1. 
- np.exp(-f))) def initialize(self, f): if np.any(f < 0.): - print "Warning: changing parameters to satisfy constraints" + print("Warning: changing parameters to satisfy constraints") return np.abs(f) def __str__(self): return '+ve' @@ -130,7 +130,7 @@ class NormalTheta(Transformation): def initialize(self, f): if np.any(f[self.var_indices] < 0.): - print "Warning: changing parameters to satisfy constraints" + print("Warning: changing parameters to satisfy constraints") f[self.var_indices] = np.abs(f[self.var_indices]) return f @@ -177,7 +177,7 @@ class NormalNaturalAntti(NormalTheta): def initialize(self, f): if np.any(f[self.var_indices] < 0.): - print "Warning: changing parameters to satisfy constraints" + print("Warning: changing parameters to satisfy constraints") f[self.var_indices] = np.abs(f[self.var_indices]) return f @@ -220,7 +220,7 @@ class NormalEta(Transformation): def initialize(self, f): if np.any(f[self.var_indices] < 0.): - print "Warning: changing parameters to satisfy constraints" + print("Warning: changing parameters to satisfy constraints") f[self.var_indices] = np.abs(f[self.var_indices]) return f @@ -360,7 +360,7 @@ class LogexpNeg(Transformation): return np.einsum('i,i->i', df, np.where(f>_lim_val, -1, -1 + np.exp(-f))) def initialize(self, f): if np.any(f < 0.): - print "Warning: changing parameters to satisfy constraints" + print("Warning: changing parameters to satisfy constraints") return np.abs(f) def __str__(self): return '+ve' @@ -412,7 +412,7 @@ class LogexpClipped(Logexp): return np.einsum('i,i->i', df, gf) # np.where(f < self.lower, 0, gf) def initialize(self, f): if np.any(f < 0.): - print "Warning: changing parameters to satisfy constraints" + print("Warning: changing parameters to satisfy constraints") return np.abs(f) def __str__(self): return '+ve_c' @@ -428,7 +428,7 @@ class Exponent(Transformation): return np.einsum('i,i->i', df, f) def initialize(self, f): if np.any(f < 0.): - print "Warning: changing parameters to satisfy constraints" + print("Warning: changing parameters to satisfy constraints") return np.abs(f) def __str__(self): return '+ve' @@ -468,7 +468,11 @@ class Logistic(Transformation): for instance in cls._instances: if instance().lower == lower and instance().upper == upper: return instance() - o = super(Transformation, cls).__new__(cls, lower, upper, *args, **kwargs) + newfunc = super(Transformation, cls).__new__ + if newfunc is object.__new__: + o = newfunc(cls) + else: + o = newfunc(cls, lower, upper, *args, **kwargs) cls._instances.append(weakref.ref(o)) return cls._instances[-1]() def __init__(self, lower, upper): @@ -486,7 +490,7 @@ class Logistic(Transformation): return np.einsum('i,i->i', df, (f - self.lower) * (self.upper - f) / self.difference) def initialize(self, f): if np.any(np.logical_or(f < self.lower, f > self.upper)): - print "Warning: changing parameters to satisfy constraints" + print("Warning: changing parameters to satisfy constraints") #return np.where(np.logical_or(f < self.lower, f > self.upper), self.f(f * 0.), f) #FIXME: Max, zeros_like right? 
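# Aside, not part of the patch: what the np.where() line below does. Out-of-range
# entries are replaced by self.f(0), which for a Logistic transform is the interval
# midpoint (assuming f(x) = lower + (upper - lower) / (1 + exp(-x))); in-range
# entries pass through unchanged. Bounds below are illustrative only.
import numpy as np
f = np.array([-3.0, 0.5, 7.0])
lower, upper = 0.0, 1.0
midpoint = lower + (upper - lower) / (1.0 + np.exp(-np.zeros_like(f)))
assert np.where(np.logical_or(f < lower, f > upper), midpoint, f).tolist() == [0.5, 0.5, 0.5]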
return np.where(np.logical_or(f < self.lower, f > self.upper), self.f(np.zeros_like(f)), f) diff --git a/GPy/core/parameterization/updateable.py b/GPy/core/parameterization/updateable.py index 379e92e1..07083ce0 100644 --- a/GPy/core/parameterization/updateable.py +++ b/GPy/core/parameterization/updateable.py @@ -3,7 +3,7 @@ Created on 11 Nov 2014 @author: maxz ''' -from observable import Observable +from .observable import Observable class Updateable(Observable): @@ -35,7 +35,7 @@ class Updateable(Observable): self.trigger_update() def toggle_update(self): - print "deprecated: toggle_update was renamed to update_toggle for easier access" + print("deprecated: toggle_update was renamed to update_toggle for easier access") self.update_toggle() def update_toggle(self): self.update_model(not self.update_model()) diff --git a/GPy/core/parameterization/variational.py b/GPy/core/parameterization/variational.py index 0804091c..ab196b98 100644 --- a/GPy/core/parameterization/variational.py +++ b/GPy/core/parameterization/variational.py @@ -5,9 +5,9 @@ Created on 6 Nov 2013 ''' import numpy as np -from parameterized import Parameterized -from param import Param -from transformations import Logexp, Logistic,__fixed__ +from .parameterized import Parameterized +from .param import Param +from .transformations import Logexp, Logistic,__fixed__ from GPy.util.misc import param_to_array from GPy.util.caching import Cache_this @@ -16,13 +16,13 @@ class VariationalPrior(Parameterized): super(VariationalPrior, self).__init__(name=name, **kw) def KL_divergence(self, variational_posterior): - raise NotImplementedError, "override this for variational inference of latent space" + raise NotImplementedError("override this for variational inference of latent space") def update_gradients_KL(self, variational_posterior): """ updates the gradients for mean and variance **in place** """ - raise NotImplementedError, "override this for variational inference of latent space" + raise NotImplementedError("override this for variational inference of latent space") class NormalPrior(VariationalPrior): def KL_divergence(self, variational_posterior): diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index a81b77fa..35644bfe 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -2,11 +2,11 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np -from gp import GP -from parameterization.param import Param +from .gp import GP +from .parameterization.param import Param from ..inference.latent_function_inference import var_dtc from .. import likelihoods -from parameterization.variational import VariationalPosterior, NormalPosterior +from .parameterization.variational import VariationalPosterior, NormalPosterior from ..util.linalg import mdot import logging @@ -47,8 +47,8 @@ class SparseGP(GP): inference_method = var_dtc.VarDTC(limit=1 if not self.missing_data else Y.shape[1]) else: #inference_method = ?? - raise NotImplementedError, "what to do what to do?" 
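# Aside, not part of the patch: the two Py3 syntax changes this hunk applies.
#   Py2: raise SomeError, "msg"        Py3: raise SomeError("msg")
#   Py2: print "a", x                  Py3: print("a", x)
# On bare Py2 the Py3 form print("a", x) would print a tuple; the future import
# makes a mixed codebase behave identically on both:
from __future__ import print_function
print("defaulting to", "EP", "for latent function inference")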
- print "defaulting to ", inference_method, "for latent function inference" + raise NotImplementedError("what to do what to do?") + print("defaulting to ", inference_method, "for latent function inference") self.Z = Param('inducing inputs', Z) self.num_inducing = Z.shape[0] diff --git a/GPy/core/sparse_gp_mpi.py b/GPy/core/sparse_gp_mpi.py index 15d3ad76..28de3124 100644 --- a/GPy/core/sparse_gp_mpi.py +++ b/GPy/core/sparse_gp_mpi.py @@ -2,7 +2,7 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np -from sparse_gp import SparseGP +from .sparse_gp import SparseGP from numpy.linalg.linalg import LinAlgError from ..inference.latent_function_inference.var_dtc_parallel import update_gradients, VarDTC_minibatch @@ -56,7 +56,7 @@ class SparseGP_MPI(SparseGP): self.N_range = (N_start, N_end) self.N_list = np.array(N_list) self.Y_local = self.Y[N_start:N_end] - print 'MPI RANK '+str(self.mpi_comm.rank)+' with the data range '+str(self.N_range) + print('MPI RANK '+str(self.mpi_comm.rank)+' with the data range '+str(self.N_range)) mpi_comm.Bcast(self.param_array, root=0) self.update_model(True) diff --git a/GPy/core/svgp.py b/GPy/core/svgp.py index 7783f3b1..fd48a7ab 100644 --- a/GPy/core/svgp.py +++ b/GPy/core/svgp.py @@ -3,8 +3,8 @@ import numpy as np from ..util import choleskies -from sparse_gp import SparseGP -from parameterization.param import Param +from .sparse_gp import SparseGP +from .parameterization.param import Param from ..inference.latent_function_inference import SVGP as svgp_inf diff --git a/GPy/core/symbolic.py b/GPy/core/symbolic.py index ed3a9d59..4a9fcb76 100644 --- a/GPy/core/symbolic.py +++ b/GPy/core/symbolic.py @@ -223,7 +223,7 @@ class Symbolic_core(): def code_gradients_cacheable(self, function, variable): if variable not in self.cacheable: - raise RuntimeError, variable + ' must be a cacheable.' + raise RuntimeError(variable + ' must be a cacheable.') lcode = 'gradients_' + variable + ' = np.zeros_like(' + variable + ')\n' lcode += 'self.update_cache(' + ', '.join(self.cacheable) + ')\n' for i, theta in enumerate(self.variables[variable]): diff --git a/GPy/core/verbose_optimization.py b/GPy/core/verbose_optimization.py index 54e650c3..a5fb019e 100644 --- a/GPy/core/verbose_optimization.py +++ b/GPy/core/verbose_optimization.py @@ -1,7 +1,7 @@ # Copyright (c) 2012-2014, Max Zwiessele. 
# Licensed under the BSD 3-clause license (see LICENSE.txt) - +from __future__ import print_function import numpy as np import sys import time @@ -73,8 +73,8 @@ class VerboseOptimization(object): #self.progress.add_class('box-flex1') else: self.exps = exponents(self.fnow, self.current_gradient) - print 'Running {} Code:'.format(self.opt_name) - print ' {3:7s} {0:{mi}s} {1:11s} {2:11s}'.format("i", "f", "|g|", "secs", mi=self.len_maxiters) + print('Running {} Code:'.format(self.opt_name)) + print(' {3:7s} {0:{mi}s} {1:11s} {2:11s}'.format("i", "f", "|g|", "secs", mi=self.len_maxiters)) def __enter__(self): self.start = time.time() @@ -116,11 +116,11 @@ class VerboseOptimization(object): b = np.any(n_exps < self.exps) if a or b: self.p_iter = self.iteration - print '' + print('') if b: self.exps = n_exps - print '\r', - print '{3:> 7.2g} {0:>0{mi}g} {1:> 12e} {2:> 12e}'.format(self.iteration, float(self.fnow), float(self.current_gradient), time.time()-self.start, mi=self.len_maxiters), # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r', + print('\r', end=' ') + print('{3:> 7.2g} {0:>0{mi}g} {1:> 12e} {2:> 12e}'.format(self.iteration, float(self.fnow), float(self.current_gradient), time.time()-self.start, mi=self.len_maxiters), end=' ') # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r', sys.stdout.flush() def print_status(self, me, which=None): @@ -149,9 +149,9 @@ class VerboseOptimization(object): self.print_out() if not self.ipython_notebook: - print '' - print 'Optimization finished in {0:.5g} Seconds'.format(self.stop-self.start) - print 'Optimization status: {0:s}'.format(self.status) - print + print() + print('Optimization finished in {0:.5g} Seconds'.format(self.stop-self.start)) + print('Optimization status: {0}'.format(self.status)) + print() elif self.clear: self.hor_align.close() diff --git a/GPy/examples/__init__.py b/GPy/examples/__init__.py index 968333e0..4e9e984e 100644 --- a/GPy/examples/__init__.py +++ b/GPy/examples/__init__.py @@ -1,7 +1,7 @@ # Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt). # Licensed under the BSD 3-clause license (see LICENSE.txt) -import classification -import regression -import dimensionality_reduction -import non_gaussian +from . import classification +from . import regression +from . import dimensionality_reduction +from . 
import non_gaussian diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py index b3780073..d4518f24 100644 --- a/GPy/examples/classification.py +++ b/GPy/examples/classification.py @@ -15,7 +15,7 @@ def oil(num_inducing=50, max_iters=100, kernel=None, optimize=True, plot=True): """ try:import pods - except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets' + except ImportError:print('pods unavailable, see https://github.com/sods/ods for example datasets') data = pods.datasets.oil() X = data['X'] Xtest = data['Xtest'] @@ -52,7 +52,7 @@ def toy_linear_1d_classification(seed=default_seed, optimize=True, plot=True): """ try:import pods - except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets' + except ImportError:print('pods unavailable, see https://github.com/sods/ods for example datasets') data = pods.datasets.toy_linear_1d_classification(seed=seed) Y = data['Y'][:, 0:1] Y[Y.flatten() == -1] = 0 @@ -75,7 +75,7 @@ def toy_linear_1d_classification(seed=default_seed, optimize=True, plot=True): m.plot_f(ax=axes[0]) m.plot(ax=axes[1]) - print m + print(m) return m def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot=True): @@ -88,7 +88,7 @@ def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot= """ try:import pods - except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets' + except ImportError:print('pods unavailable, see https://github.com/sods/ods for example datasets') data = pods.datasets.toy_linear_1d_classification(seed=seed) Y = data['Y'][:, 0:1] Y[Y.flatten() == -1] = 0 @@ -114,7 +114,7 @@ def toy_linear_1d_classification_laplace(seed=default_seed, optimize=True, plot= m.plot_f(ax=axes[0]) m.plot(ax=axes[1]) - print m + print(m) return m def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, optimize=True, plot=True): @@ -127,7 +127,7 @@ def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, opti """ try:import pods - except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets' + except ImportError:print('pods unavailable, see https://github.com/sods/ods for example datasets') data = pods.datasets.toy_linear_1d_classification(seed=seed) Y = data['Y'][:, 0:1] Y[Y.flatten() == -1] = 0 @@ -147,7 +147,7 @@ def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, opti m.plot_f(ax=axes[0]) m.plot(ax=axes[1]) - print m + print(m) return m def toy_heaviside(seed=default_seed, max_iters=100, optimize=True, plot=True): @@ -160,7 +160,7 @@ def toy_heaviside(seed=default_seed, max_iters=100, optimize=True, plot=True): """ try:import pods - except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets' + except ImportError:print('pods unavailable, see https://github.com/sods/ods for example datasets') data = pods.datasets.toy_linear_1d_classification(seed=seed) Y = data['Y'][:, 0:1] Y[Y.flatten() == -1] = 0 @@ -177,7 +177,7 @@ def toy_heaviside(seed=default_seed, max_iters=100, optimize=True, plot=True): # Parameters optimization: for _ in range(5): m.optimize(max_iters=int(max_iters/5)) - print m + print(m) # Plot if plot: @@ -186,7 +186,7 @@ def toy_heaviside(seed=default_seed, max_iters=100, optimize=True, plot=True): m.plot_f(ax=axes[0]) m.plot(ax=axes[1]) - print m + print(m) return m def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=None, 
optimize=True, plot=True): @@ -202,7 +202,7 @@ def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel= :type kernel: a GPy kernel """ try:import pods - except ImportError:print 'pods unavailable, see https://github.com/sods/ods for example datasets' + except ImportError:print('pods unavailable, see https://github.com/sods/ods for example datasets') data = pods.datasets.crescent_data(seed=seed) Y = data['Y'] Y[Y.flatten()==-1] = 0 @@ -224,5 +224,5 @@ def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel= if plot: m.plot() - print m + print(m) return m diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index df9093a2..46107a71 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -333,7 +333,7 @@ def bgplvm_simulation(optimize=True, verbose=1, m.likelihood.variance = .1 if optimize: - print "Optimizing model:" + print("Optimizing model:") m.optimize('bfgs', messages=verbose, max_iters=max_iters, gtol=.05) if plot: @@ -358,7 +358,7 @@ def ssgplvm_simulation(optimize=True, verbose=1, m.likelihood.variance = .1 if optimize: - print "Optimizing model:" + print("Optimizing model:") m.optimize('scg', messages=verbose, max_iters=max_iters, gtol=.05) if plot: @@ -388,7 +388,7 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1, m.Yreal = Y if optimize: - print "Optimizing model:" + print("Optimizing model:") m.optimize('bfgs', messages=verbose, max_iters=max_iters, gtol=.05) if plot: @@ -411,7 +411,7 @@ def mrd_simulation(optimize=True, verbose=True, plot=True, plot_sim=True, **kw): m['.*noise'] = [Y.var() / 40. for Y in Ylist] if optimize: - print "Optimizing Model:" + print("Optimizing Model:") m.optimize(messages=verbose, max_iters=8e3) if plot: m.X.plot("MRD Latent Space 1D") @@ -439,7 +439,7 @@ def mrd_simulation_missing_data(optimize=True, verbose=True, plot=True, plot_sim initx="random", initz='permute', **kw) if optimize: - print "Optimizing Model:" + print("Optimizing Model:") m.optimize('bfgs', messages=verbose, max_iters=8e3, gtol=.1) if plot: m.X.plot("MRD Latent Space 1D") @@ -603,7 +603,7 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True): try: if optimize: m.optimize('bfgs', messages=verbose, max_iters=5e3, bfgs_factor=10) except KeyboardInterrupt: - print "Keyboard interrupt, continuing to plot and return" + print("Keyboard interrupt, continuing to plot and return") if plot: fig, (latent_axes, sense_axes) = plt.subplots(1, 2) @@ -653,7 +653,7 @@ def ssgplvm_simulation_linear(): def sample_X(Q, pi): x = np.empty(Q) dies = np.random.rand(Q) - for q in xrange(Q): + for q in range(Q): if dies[q] < pi: x[q] = np.random.randn() else: @@ -663,7 +663,7 @@ def ssgplvm_simulation_linear(): Y = np.empty((N, D)) X = np.empty((N, Q)) # Generate data from random sampled weight matrices - for n in xrange(N): + for n in range(N): X[n] = sample_X(Q, pi) w = np.random.randn(D, Q) Y[n] = np.dot(w, X[n]) diff --git a/GPy/examples/non_gaussian.py b/GPy/examples/non_gaussian.py index ddac8813..3652b4d3 100644 --- a/GPy/examples/non_gaussian.py +++ b/GPy/examples/non_gaussian.py @@ -37,7 +37,7 @@ def student_t_approx(optimize=True, plot=True): #Add student t random noise to datapoints deg_free = 1 - print "Real noise: ", real_std + print("Real noise: ", real_std) initial_var_guess = 0.5 edited_real_sd = initial_var_guess @@ -73,7 +73,7 @@ def student_t_approx(optimize=True, plot=True): m4['.*t_scale2'].constrain_bounded(1e-6, 10.) 
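The classification and non_gaussian hunks above convert Python 2 print statements mechanically. One subtlety: the hunks shown here do not add `from __future__ import print_function` to the examples modules, so under Python 2 a call such as `print("Real noise: ", real_std)` prints a 2-tuple rather than two space-joined fields. A minimal sketch of the portable form (not GPy code; `real_std` is a hypothetical stand-in):

    from __future__ import print_function

    real_std = 0.1  # hypothetical stand-in for the example's noise level

    # Python 2 statement form (a syntax error under Python 3):
    #     print "Real noise: ", real_std
    # Function form; without the __future__ import, Python 2 would print
    # the tuple ('Real noise: ', 0.1) instead of the two joined fields.
    print("Real noise: ", real_std)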
m4['.*white'].constrain_fixed(1e-5) m4.randomize() - print m4 + print(m4) debug=True if debug: m4.optimize(messages=1) @@ -81,18 +81,18 @@ def student_t_approx(optimize=True, plot=True): pb.plot(m4.X, m4.inference_method.f_hat) pb.plot(m4.X, m4.Y, 'rx') m4.plot() - print m4 + print(m4) return m4 if optimize: optimizer='scg' - print "Clean Gaussian" + print("Clean Gaussian") m1.optimize(optimizer, messages=1) - print "Corrupt Gaussian" + print("Corrupt Gaussian") m2.optimize(optimizer, messages=1) - print "Clean student t" + print("Clean student t") m3.optimize(optimizer, messages=1) - print "Corrupt student t" + print("Corrupt student t") m4.optimize(optimizer, messages=1) if plot: @@ -151,7 +151,7 @@ def boston_example(optimize=True, plot=True): for n, (train, test) in enumerate(kf): X_train, X_test, Y_train, Y_test = X[train], X[test], Y[train], Y[test] - print "Fold {}".format(n) + print("Fold {}".format(n)) noise = 1e-1 #np.exp(-2) rbf_len = 0.5 @@ -163,21 +163,21 @@ def boston_example(optimize=True, plot=True): score_folds[0, n] = rmse(Y_test, np.mean(Y_train)) #Gaussian GP - print "Gauss GP" + print("Gauss GP") mgp = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelgp.copy()) mgp.constrain_fixed('.*white', 1e-5) mgp['.*len'] = rbf_len mgp['.*noise'] = noise - print mgp + print(mgp) if optimize: mgp.optimize(optimizer=optimizer, messages=messages) Y_test_pred = mgp.predict(X_test) score_folds[1, n] = rmse(Y_test, Y_test_pred[0]) pred_density[1, n] = np.mean(mgp.log_predictive_density(X_test, Y_test)) - print mgp - print pred_density + print(mgp) + print(pred_density) - print "Gaussian Laplace GP" + print("Gaussian Laplace GP") N, D = Y_train.shape g_distribution = GPy.likelihoods.noise_model_constructors.gaussian(variance=noise, N=N, D=D) g_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), g_distribution) @@ -186,18 +186,18 @@ def boston_example(optimize=True, plot=True): mg.constrain_fixed('.*white', 1e-5) mg['rbf_len'] = rbf_len mg['noise'] = noise - print mg + print(mg) if optimize: mg.optimize(optimizer=optimizer, messages=messages) Y_test_pred = mg.predict(X_test) score_folds[2, n] = rmse(Y_test, Y_test_pred[0]) pred_density[2, n] = np.mean(mg.log_predictive_density(X_test, Y_test)) - print pred_density - print mg + print(pred_density) + print(mg) for stu_num, df in enumerate(degrees_freedoms): #Student T - print "Student-T GP {}df".format(df) + print("Student-T GP {}df".format(df)) t_distribution = GPy.likelihoods.noise_model_constructors.student_t(deg_free=df, sigma2=noise) stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution) mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood) @@ -205,14 +205,14 @@ def boston_example(optimize=True, plot=True): mstu_t.constrain_bounded('.*t_scale2', 0.0001, 1000) mstu_t['rbf_len'] = rbf_len mstu_t['.*t_scale2'] = noise - print mstu_t + print(mstu_t) if optimize: mstu_t.optimize(optimizer=optimizer, messages=messages) Y_test_pred = mstu_t.predict(X_test) score_folds[3+stu_num, n] = rmse(Y_test, Y_test_pred[0]) pred_density[3+stu_num, n] = np.mean(mstu_t.log_predictive_density(X_test, Y_test)) - print pred_density - print mstu_t + print(pred_density) + print(mstu_t) if plot: plt.figure() @@ -230,8 +230,8 @@ def boston_example(optimize=True, plot=True): plt.scatter(X_test[:, data_axis_plot], Y_test, c='r', marker='x') plt.title('Stu t {}df'.format(df)) - print "Average scores: {}".format(np.mean(score_folds, 1)) - print "Average pred 
density: {}".format(np.mean(pred_density, 1)) + print("Average scores: {}".format(np.mean(score_folds, 1))) + print("Average pred density: {}".format(np.mean(pred_density, 1))) if plot: #Plotting diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py index 0e68d0bf..267c6d1e 100644 --- a/GPy/examples/regression.py +++ b/GPy/examples/regression.py @@ -15,7 +15,7 @@ def olympic_marathon_men(optimize=True, plot=True): """Run a standard Gaussian process regression on the Olympic marathon data.""" try:import pods except ImportError: - print 'pods unavailable, see https://github.com/sods/ods for example datasets' + print('pods unavailable, see https://github.com/sods/ods for example datasets') return data = pods.datasets.olympic_marathon_men() @@ -88,7 +88,7 @@ def epomeo_gpx(max_iters=200, optimize=True, plot=True): """ try:import pods except ImportError: - print 'pods unavailable, see https://github.com/sods/ods for example datasets' + print('pods unavailable, see https://github.com/sods/ods for example datasets') return data = pods.datasets.epomeo_gpx() num_data_list = [] @@ -135,7 +135,7 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000 try:import pods except ImportError: - print 'pods unavailable, see https://github.com/sods/ods for example datasets' + print('pods unavailable, see https://github.com/sods/ods for example datasets') return data = pods.datasets.della_gatta_TRP63_gene_expression(data_set='della_gatta',gene_number=gene_number) # data['Y'] = data['Y'][0::2, :] @@ -219,7 +219,7 @@ def olympic_100m_men(optimize=True, plot=True): """Run a standard Gaussian process regression on the Rogers and Girolami olympics data.""" try:import pods except ImportError: - print 'pods unavailable, see https://github.com/sods/ods for example datasets' + print('pods unavailable, see https://github.com/sods/ods for example datasets') return data = pods.datasets.olympic_100m_men() @@ -240,7 +240,7 @@ def toy_rbf_1d(optimize=True, plot=True): """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance.""" try:import pods except ImportError: - print 'pods unavailable, see https://github.com/sods/ods for example datasets' + print('pods unavailable, see https://github.com/sods/ods for example datasets') return data = pods.datasets.toy_rbf_1d() @@ -258,7 +258,7 @@ def toy_rbf_1d_50(optimize=True, plot=True): """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance.""" try:import pods except ImportError: - print 'pods unavailable, see https://github.com/sods/ods for example datasets' + print('pods unavailable, see https://github.com/sods/ods for example datasets') return data = pods.datasets.toy_rbf_1d_50() @@ -377,7 +377,7 @@ def robot_wireless(max_iters=100, kernel=None, optimize=True, plot=True): """Predict the location of a robot given wirelss signal strength readings.""" try:import pods except ImportError: - print 'pods unavailable, see https://github.com/sods/ods for example datasets' + print('pods unavailable, see https://github.com/sods/ods for example datasets') return data = pods.datasets.robot_wireless() @@ -398,14 +398,14 @@ def robot_wireless(max_iters=100, kernel=None, optimize=True, plot=True): sse = ((data['Xtest'] - Xpredict)**2).sum() - print('Sum of squares error on test data: ' + str(sse)) + print(('Sum of squares error on test data: ' + str(sse))) return m def silhouette(max_iters=100, optimize=True, plot=True): """Predict the pose of a 
figure given a silhouette. This is a task from Agarwal and Triggs 2004 ICML paper.""" try:import pods except ImportError: - print 'pods unavailable, see https://github.com/sods/ods for example datasets' + print('pods unavailable, see https://github.com/sods/ods for example datasets') return data = pods.datasets.silhouette() @@ -416,7 +416,7 @@ def silhouette(max_iters=100, optimize=True, plot=True): if optimize: m.optimize(messages=True, max_iters=max_iters) - print m + print(m) return m def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100, optimize=True, plot=True, checkgrad=False): @@ -468,7 +468,7 @@ def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100, opt if plot: m.plot() - print m + print(m) return m def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True): @@ -492,7 +492,7 @@ def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True): if plot: m.plot(ax=axes[0]) axes[0].set_title('no input uncertainty') - print m + print(m) # the same Model with uncertainty m = GPy.models.SparseGPRegression(X, Y, kernel=GPy.kern.RBF(1), Z=Z, X_variance=S) @@ -503,7 +503,7 @@ def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True): axes[1].set_title('with input uncertainty') fig.canvas.draw() - print m + print(m) return m def simple_mean_function(max_iters=100, optimize=True, plot=True): diff --git a/GPy/inference/__init__.py b/GPy/inference/__init__.py index 7b1307e3..c5044582 100644 --- a/GPy/inference/__init__.py +++ b/GPy/inference/__init__.py @@ -1,3 +1,3 @@ -import latent_function_inference -import optimization -import mcmc +from . import latent_function_inference +from . import optimization +from . import mcmc diff --git a/GPy/inference/latent_function_inference/__init__.py b/GPy/inference/latent_function_inference/__init__.py index dc7789ba..6754000d 100644 --- a/GPy/inference/latent_function_inference/__init__.py +++ b/GPy/inference/latent_function_inference/__init__.py @@ -61,15 +61,15 @@ class InferenceMethodList(LatentFunctionInference, list): for inf in state: self.append(inf) -from exact_gaussian_inference import ExactGaussianInference -from laplace import Laplace, LaplaceBlock +from .exact_gaussian_inference import ExactGaussianInference +from .laplace import Laplace,LaplaceBlock from GPy.inference.latent_function_inference.var_dtc import VarDTC -from expectation_propagation import EP -from expectation_propagation_dtc import EPDTC -from dtc import DTC -from fitc import FITC -from var_dtc_parallel import VarDTC_minibatch -from svgp import SVGP +from .expectation_propagation import EP +from .expectation_propagation_dtc import EPDTC +from .dtc import DTC +from .fitc import FITC +from .var_dtc_parallel import VarDTC_minibatch +from .svgp import SVGP # class FullLatentFunctionData(object): # diff --git a/GPy/inference/latent_function_inference/dtc.py b/GPy/inference/latent_function_inference/dtc.py index a12726e2..0aa990c1 100644 --- a/GPy/inference/latent_function_inference/dtc.py +++ b/GPy/inference/latent_function_inference/dtc.py @@ -1,7 +1,7 @@ # Copyright (c) 2012-2014, James Hensman # Licensed under the BSD 3-clause license (see LICENSE.txt) -from posterior import Posterior +from .posterior import Posterior from ...util.linalg import jitchol, tdot, dtrtrs, dpotri, pdinv import numpy as np from . 
import LatentFunctionInference @@ -30,7 +30,7 @@ class DTC(LatentFunctionInference): #make sure the noise is not hetero beta = 1./likelihood.gaussian_variance(Y_metadata) if beta.size > 1: - raise NotImplementedError, "no hetero noise with this implementation of DTC" + raise NotImplementedError("no hetero noise with this implementation of DTC") Kmm = kern.K(Z) Knn = kern.Kdiag(X) @@ -99,7 +99,7 @@ class vDTC(object): #make sure the noise is not hetero beta = 1./likelihood.gaussian_variance(Y_metadata) if beta.size > 1: - raise NotImplementedError, "no hetero noise with this implementation of DTC" + raise NotImplementedError("no hetero noise with this implementation of DTC") Kmm = kern.K(Z) Knn = kern.Kdiag(X) diff --git a/GPy/inference/latent_function_inference/exact_gaussian_inference.py b/GPy/inference/latent_function_inference/exact_gaussian_inference.py index b2f1b7d0..2a0a2592 100644 --- a/GPy/inference/latent_function_inference/exact_gaussian_inference.py +++ b/GPy/inference/latent_function_inference/exact_gaussian_inference.py @@ -1,7 +1,7 @@ # Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt). # Licensed under the BSD 3-clause license (see LICENSE.txt) -from posterior import Posterior +from .posterior import Posterior from ...util.linalg import pdinv, dpotrs, tdot from ...util import diag import numpy as np diff --git a/GPy/inference/latent_function_inference/expectation_propagation.py b/GPy/inference/latent_function_inference/expectation_propagation.py index e09cf6d0..85841a33 100644 --- a/GPy/inference/latent_function_inference/expectation_propagation.py +++ b/GPy/inference/latent_function_inference/expectation_propagation.py @@ -2,7 +2,7 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np from ...util.linalg import pdinv,jitchol,DSYR,tdot,dtrtrs, dpotrs -from posterior import Posterior +from .posterior import Posterior from . import LatentFunctionInference log_2_pi = np.log(2*np.pi) diff --git a/GPy/inference/latent_function_inference/expectation_propagation_dtc.py b/GPy/inference/latent_function_inference/expectation_propagation_dtc.py index 466cbbb2..e182c9f7 100644 --- a/GPy/inference/latent_function_inference/expectation_propagation_dtc.py +++ b/GPy/inference/latent_function_inference/expectation_propagation_dtc.py @@ -6,7 +6,7 @@ from ...util import diag from ...util.linalg import mdot, jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, dpotri, dpotrs, symmetrify, DSYR from ...core.parameterization.variational import VariationalPosterior from . 
import LatentFunctionInference
-from posterior import Posterior
+from .posterior import Posterior
log_2_pi = np.log(2*np.pi)
class EPDTC(LatentFunctionInference):
@@ -180,7 +180,7 @@ class EPDTC(LatentFunctionInference):
if VVT_factor.shape[1] == Y.shape[1]:
woodbury_vector = Cpsi1Vf # == Cpsi1V
else:
- print 'foobar'
+ print('foobar')
psi1V = np.dot(mu_tilde[:,None].T*beta, psi1).T
tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
tmp, _ = dpotrs(LB, tmp, lower=1)
@@ -315,7 +315,7 @@ def _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf,
dL_dR = None
elif het_noise:
if uncertain_inputs:
- raise NotImplementedError, "heteroscedatic derivates with uncertain inputs not implemented"
+ raise NotImplementedError("heteroscedastic derivatives with uncertain inputs not implemented")
else:
#from ...util.linalg import chol_inv
#LBi = chol_inv(LB)
diff --git a/GPy/inference/latent_function_inference/fitc.py b/GPy/inference/latent_function_inference/fitc.py
index f99b35ff..f38eb52b 100644
--- a/GPy/inference/latent_function_inference/fitc.py
+++ b/GPy/inference/latent_function_inference/fitc.py
@@ -1,7 +1,7 @@
# Copyright (c) 2012, James Hensman
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-from posterior import Posterior
+from .posterior import Posterior
from ...util.linalg import jitchol, tdot, dtrtrs, dpotri, pdinv
from ...util import diag
import numpy as np
@@ -27,7 +27,7 @@ class FITC(LatentFunctionInference):
#make sure the noise is not hetero
sigma_n = likelihood.gaussian_variance(Y_metadata)
if sigma_n.size >1:
- raise NotImplementedError, "no hetero noise with this implementation of FITC"
+ raise NotImplementedError("no hetero noise with this implementation of FITC")
Kmm = kern.K(Z)
Knn = kern.Kdiag(X)
diff --git a/GPy/inference/latent_function_inference/laplace.py b/GPy/inference/latent_function_inference/laplace.py
index 862eff2a..c6921f57 100644
--- a/GPy/inference/latent_function_inference/laplace.py
+++ b/GPy/inference/latent_function_inference/laplace.py
@@ -12,7 +12,7 @@
import numpy as np
from ...util.linalg import mdot, jitchol, dpotrs, dtrtrs, dpotri, symmetrify, pdinv
-from posterior import Posterior
+from .posterior import Posterior
import warnings
def warning_on_one_line(message, category, filename, lineno, file=None, line=None):
return ' %s:%s: %s:%s\n' % (filename, lineno, category.__name__, message)
diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py
index a1d42c74..fbd72f57 100644
--- a/GPy/inference/latent_function_inference/posterior.py
+++ b/GPy/inference/latent_function_inference/posterior.py
@@ -52,7 +52,7 @@ class Posterior(object):
or ((mean is not None) and (cov is not None)):
pass # we have sufficient to compute the posterior
else:
- raise ValueError, "insufficient information to compute the posterior"
+ raise ValueError("insufficient information to compute the posterior")
self._K_chol = K_chol
self._K = K
@@ -108,7 +108,7 @@ class Posterior(object):
if self._precision is None:
cov = np.atleast_3d(self.covariance)
self._precision = np.zeros(cov.shape) # if one covariance per dimension
- for p in xrange(cov.shape[-1]):
+ for p in range(cov.shape[-1]):
self._precision[:,:,p] = pdinv(cov[:,:,p])[0]
return self._precision
@@ -126,7 +126,7 @@ class Posterior(object):
if self._woodbury_inv is not None:
winv = np.atleast_3d(self._woodbury_inv)
self._woodbury_chol = np.zeros(winv.shape)
- for p in xrange(winv.shape[-1]):
+ for p in range(winv.shape[-1]):
self._woodbury_chol[:,:,p] = pdinv(winv[:,:,p])[2]
#Li = jitchol(self._woodbury_inv)
#self._woodbury_chol, _ = dtrtri(Li)
@@ -135,13 +135,13 @@
#self._woodbury_chol = jitchol(W)
#try computing woodbury chol from cov
elif self._covariance is not None:
- raise NotImplementedError, "TODO: check code here"
+ raise NotImplementedError("TODO: check code here")
B = self._K - self._covariance
tmp, _ = dpotrs(self.K_chol, B)
self._woodbury_inv, _ = dpotrs(self.K_chol, tmp.T)
_, _, self._woodbury_chol, _ = pdinv(self._woodbury_inv)
else:
- raise ValueError, "insufficient information to compute posterior"
+ raise ValueError("insufficient information to compute posterior")
return self._woodbury_chol
@property
@@ -161,7 +161,7 @@
elif self._covariance is not None:
B = np.atleast_3d(self._K) - np.atleast_3d(self._covariance)
self._woodbury_inv = np.empty_like(B)
- for i in xrange(B.shape[-1]):
+ for i in range(B.shape[-1]):
tmp, _ = dpotrs(self.K_chol, B[:,:,i])
self._woodbury_inv[:,:,i], _ = dpotrs(self.K_chol, tmp.T)
return self._woodbury_inv
diff --git a/GPy/inference/latent_function_inference/svgp.py b/GPy/inference/latent_function_inference/svgp.py
index 2215356e..bad73a71 100644
--- a/GPy/inference/latent_function_inference/svgp.py
+++ b/GPy/inference/latent_function_inference/svgp.py
@@ -2,7 +2,7 @@ from . import LatentFunctionInference
from ...util import linalg
from ...util import choleskies
import numpy as np
-from posterior import Posterior
+from .posterior import Posterior
class SVGP(LatentFunctionInference):
diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py
index 9c4d51bb..97d8dfe3 100644
--- a/GPy/inference/latent_function_inference/var_dtc.py
+++ b/GPy/inference/latent_function_inference/var_dtc.py
@@ -1,7 +1,7 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-from posterior import Posterior
+from .posterior import Posterior
from ...util.linalg import mdot, jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, dpotri, dpotrs, symmetrify
from ...util import diag
from ...core.parameterization.variational import VariationalPosterior
@@ -170,7 +170,7 @@ class VarDTC(LatentFunctionInference):
if VVT_factor.shape[1] == Y.shape[1]:
woodbury_vector = Cpsi1Vf # == Cpsi1V
else:
- print 'foobar'
+ print('foobar')
import ipdb; ipdb.set_trace()
psi1V = np.dot(Y.T*beta, psi1).T
tmp, _ = dtrtrs(Lm, psi1V, lower=1, trans=0)
@@ -213,7 +213,7 @@ def _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf,
dL_dR = None
elif het_noise:
if uncertain_inputs:
- raise NotImplementedError, "heteroscedatic derivates with uncertain inputs not implemented"
+ raise NotImplementedError("heteroscedastic derivatives with uncertain inputs not implemented")
else:
#from ...util.linalg import chol_inv
#LBi = chol_inv(LB)
diff --git a/GPy/inference/latent_function_inference/var_dtc_parallel.py b/GPy/inference/latent_function_inference/var_dtc_parallel.py
index 2e633e16..4b884d4c 100644
--- a/GPy/inference/latent_function_inference/var_dtc_parallel.py
+++ b/GPy/inference/latent_function_inference/var_dtc_parallel.py
@@ -1,7 +1,7 @@
# Copyright (c) 2014, GPy authors (see AUTHORS.txt).
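The posterior.py hunks above replace `xrange` with `range`; `xrange` no longer exists in Python 3, where `range` is itself lazy, so for pure iteration the rewrite is a drop-in. A minimal sketch of the loop pattern (not GPy code; numpy's `inv` stands in for GPy's `pdinv`):

    import numpy as np

    cov = np.atleast_3d(np.eye(4))     # hypothetical (4, 4, 1) stack of covariances
    precision = np.zeros(cov.shape)    # one precision matrix per output dimension
    for p in range(cov.shape[-1]):     # was: for p in xrange(cov.shape[-1])
        precision[:, :, p] = np.linalg.inv(cov[:, :, p])
    print(precision.shape)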
# Licensed under the BSD 3-clause license (see LICENSE.txt) -from posterior import Posterior +from .posterior import Posterior from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri,pdinv from ...util import diag from ...core.parameterization.variational import VariationalPosterior @@ -92,7 +92,7 @@ class VarDTC_minibatch(LatentFunctionInference): psi0_full = 0. YRY_full = 0. - for n_start in xrange(0,num_data,batchsize): + for n_start in range(0,num_data,batchsize): n_end = min(batchsize+n_start, num_data) if batchsize==num_data: Y_slice = Y @@ -170,7 +170,7 @@ class VarDTC_minibatch(LatentFunctionInference): Kmm = kern.K(Z).copy() diag.add(Kmm, self.const_jitter) if not np.isfinite(Kmm).all(): - print Kmm + print(Kmm) Lm = jitchol(Kmm) LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right') diff --git a/GPy/inference/mcmc/__init__.py b/GPy/inference/mcmc/__init__.py index 956448d4..8f185457 100644 --- a/GPy/inference/mcmc/__init__.py +++ b/GPy/inference/mcmc/__init__.py @@ -1 +1 @@ -from hmc import HMC +from .hmc import HMC diff --git a/GPy/inference/mcmc/hmc.py b/GPy/inference/mcmc/hmc.py index ec6399b6..fcc72591 100644 --- a/GPy/inference/mcmc/hmc.py +++ b/GPy/inference/mcmc/hmc.py @@ -39,7 +39,7 @@ class HMC: :rtype: numpy.ndarray """ params = np.empty((num_samples,self.p.size)) - for i in xrange(num_samples): + for i in range(num_samples): self.p[:] = np.random.multivariate_normal(np.zeros(self.p.size),self.M) H_old = self._computeH() theta_old = self.model.optimizer_array.copy() @@ -59,7 +59,7 @@ class HMC: return params def _update(self, hmc_iters): - for i in xrange(hmc_iters): + for i in range(hmc_iters): self.p[:] += -self.stepsize/2.*self.model._transform_gradients(self.model.objective_function_gradients()) self.model.optimizer_array = self.model.optimizer_array + self.stepsize*np.dot(self.Minv, self.p) self.p[:] += -self.stepsize/2.*self.model._transform_gradients(self.model.objective_function_gradients()) @@ -82,7 +82,7 @@ class HMC_shortcut: def sample(self, m_iters=1000, hmc_iters=20): params = np.empty((m_iters,self.p.size)) - for i in xrange(m_iters): + for i in range(m_iters): # sample a stepsize from the uniform distribution stepsize = np.exp(np.random.rand()*(self.stepsize_range[1]-self.stepsize_range[0])+self.stepsize_range[0]) self.p[:] = np.random.multivariate_normal(np.zeros(self.p.size),self.M) diff --git a/GPy/inference/mcmc/samplers.py b/GPy/inference/mcmc/samplers.py index 444d99d7..6459e8af 100644 --- a/GPy/inference/mcmc/samplers.py +++ b/GPy/inference/mcmc/samplers.py @@ -9,7 +9,13 @@ import sys import re import numdifftools as ndt import pdb -import cPickle + +try: + #In Python 2, cPickle is faster. 
It does not exist in Python 3 but the underlying code is always used + #if available + import cPickle as pickle +except ImportError: + import pickle class Metropolis_Hastings: @@ -40,7 +46,7 @@ class Metropolis_Hastings: fcurrent = self.model.log_likelihood() + self.model.log_prior() accepted = np.zeros(Ntotal,dtype=np.bool) for it in range(Ntotal): - print "sample %d of %d\r"%(it,Ntotal), + print("sample %d of %d\r"%(it,Ntotal), end=' ') sys.stdout.flush() prop = np.random.multivariate_normal(current, self.cov*self.scale*self.scale) self.model._set_params_transformed(prop) diff --git a/GPy/inference/optimization/__init__.py b/GPy/inference/optimization/__init__.py index 1a8f043b..909f897b 100644 --- a/GPy/inference/optimization/__init__.py +++ b/GPy/inference/optimization/__init__.py @@ -1,2 +1,2 @@ -from scg import SCG -from optimization import * +from .scg import SCG +from .optimization import * diff --git a/GPy/inference/optimization/conjugate_gradient_descent.py b/GPy/inference/optimization/conjugate_gradient_descent.py index dfc4a48d..fc2d8b61 100644 --- a/GPy/inference/optimization/conjugate_gradient_descent.py +++ b/GPy/inference/optimization/conjugate_gradient_descent.py @@ -1,7 +1,7 @@ # Copyright (c) 2012-2014, Max Zwiessele # Licensed under the BSD 3-clause license (see LICENSE.txt) -from gradient_descent_update_rules import FletcherReeves, \ +from .gradient_descent_update_rules import FletcherReeves, \ PolakRibiere from Queue import Empty from multiprocessing import Value @@ -74,7 +74,7 @@ class _Async_Optimization(Thread): if self.outq is not None: self.outq.put(self.SENTINEL) if self.messages: - print "" + print("") self.runsignal.clear() def run(self, *args, **kwargs): @@ -213,7 +213,7 @@ class Async_Optimize(object): # # print "^C" # self.runsignal.clear() # c.join() - print "WARNING: callback still running, optimisation done!" 
+ print("WARNING: callback still running, optimisation done!") return p.result class CGD(Async_Optimize): diff --git a/GPy/inference/optimization/optimization.py b/GPy/inference/optimization/optimization.py index 5aa2ed03..fd140688 100644 --- a/GPy/inference/optimization/optimization.py +++ b/GPy/inference/optimization/optimization.py @@ -10,7 +10,7 @@ try: rasm_available = True except ImportError: rasm_available = False -from scg import SCG +from .scg import SCG class Optimizer(): """ @@ -54,7 +54,7 @@ class Optimizer(): self.time = str(end - start) def opt(self, f_fp=None, f=None, fp=None): - raise NotImplementedError, "this needs to be implemented to use the optimizer class" + raise NotImplementedError("this needs to be implemented to use the optimizer class") def plot(self): """ @@ -125,9 +125,9 @@ class opt_lbfgsb(Optimizer): opt_dict = {} if self.xtol is not None: - print "WARNING: l-bfgs-b doesn't have an xtol arg, so I'm going to ignore it" + print("WARNING: l-bfgs-b doesn't have an xtol arg, so I'm going to ignore it") if self.ftol is not None: - print "WARNING: l-bfgs-b doesn't have an ftol arg, so I'm going to ignore it" + print("WARNING: l-bfgs-b doesn't have an ftol arg, so I'm going to ignore it") if self.gtol is not None: opt_dict['pgtol'] = self.gtol if self.bfgs_factor is not None: @@ -162,7 +162,7 @@ class opt_simplex(Optimizer): if self.ftol is not None: opt_dict['ftol'] = self.ftol if self.gtol is not None: - print "WARNING: simplex doesn't have an gtol arg, so I'm going to ignore it" + print("WARNING: simplex doesn't have an gtol arg, so I'm going to ignore it") opt_result = optimize.fmin(f, self.x_init, (), disp=self.messages, maxfun=self.max_f_eval, full_output=True, **opt_dict) @@ -190,11 +190,11 @@ class opt_rasm(Optimizer): opt_dict = {} if self.xtol is not None: - print "WARNING: minimize doesn't have an xtol arg, so I'm going to ignore it" + print("WARNING: minimize doesn't have an xtol arg, so I'm going to ignore it") if self.ftol is not None: - print "WARNING: minimize doesn't have an ftol arg, so I'm going to ignore it" + print("WARNING: minimize doesn't have an ftol arg, so I'm going to ignore it") if self.gtol is not None: - print "WARNING: minimize doesn't have an gtol arg, so I'm going to ignore it" + print("WARNING: minimize doesn't have an gtol arg, so I'm going to ignore it") opt_result = rasm.minimize(self.x_init, f_fp, (), messages=self.messages, maxnumfuneval=self.max_f_eval) diff --git a/GPy/inference/optimization/scg.py b/GPy/inference/optimization/scg.py index 34dd181f..8960de1d 100644 --- a/GPy/inference/optimization/scg.py +++ b/GPy/inference/optimization/scg.py @@ -21,14 +21,13 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. 
- +from __future__ import print_function import numpy as np import sys - def print_out(len_maxiters, fnow, current_grad, beta, iteration): - print '\r', - print '{0:>0{mi}g} {1:> 12e} {2:< 12.6e} {3:> 12e}'.format(iteration, float(fnow), float(beta), float(current_grad), mi=len_maxiters), # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r', + print('\r', end=' ') + print('{0:>0{mi}g} {1:> 12e} {2:< 12.6e} {3:> 12e}'.format(iteration, float(fnow), float(beta), float(current_grad), mi=len_maxiters), end=' ') # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r', sys.stdout.flush() def exponents(fnow, current_grad): @@ -80,7 +79,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True, len_maxiters = len(str(maxiters)) if display: - print ' {0:{mi}s} {1:11s} {2:11s} {3:11s}'.format("I", "F", "Scale", "|g|", mi=len_maxiters) + print(' {0:{mi}s} {1:11s} {2:11s} {3:11s}'.format("I", "F", "Scale", "|g|", mi=len_maxiters)) exps = exponents(fnow, current_grad) p_iter = iteration @@ -140,7 +139,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True, b = np.any(n_exps < exps) if a or b: p_iter = iteration - print '' + print('') if b: exps = n_exps @@ -189,6 +188,6 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True, if display: print_out(len_maxiters, fnow, current_grad, beta, iteration) - print "" - print status + print("") + print(status) return x, flog, function_eval, status diff --git a/GPy/inference/optimization/stochastics.py b/GPy/inference/optimization/stochastics.py index dc71d539..f1532bc5 100644 --- a/GPy/inference/optimization/stochastics.py +++ b/GPy/inference/optimization/stochastics.py @@ -30,7 +30,7 @@ class SparseGPMissing(StochasticStorage): Thus, we can just make sure the loop goes over self.d every time. 
""" - self.d = xrange(model.Y_normalized.shape[1]) + self.d = range(model.Y_normalized.shape[1]) class SparseGPStochastics(StochasticStorage): """ diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py index 0e1f8a0d..1c389bea 100644 --- a/GPy/kern/__init__.py +++ b/GPy/kern/__init__.py @@ -1,21 +1,21 @@ -from _src.kern import Kern -from _src.rbf import RBF -from _src.linear import Linear, LinearFull -from _src.static import Bias, White, Fixed -from _src.brownian import Brownian -from _src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine -from _src.mlp import MLP -from _src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52 -from _src.independent_outputs import IndependentOutputs, Hierarchical -from _src.coregionalize import Coregionalize -from _src.ODE_UY import ODE_UY -from _src.ODE_UYC import ODE_UYC -from _src.ODE_st import ODE_st -from _src.ODE_t import ODE_t -from _src.poly import Poly -from _src.eq_ode2 import EQ_ODE2 +from ._src.kern import Kern +from ._src.rbf import RBF +from ._src.linear import Linear, LinearFull +from ._src.static import Bias, White, Fixed +from ._src.brownian import Brownian +from ._src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine +from ._src.mlp import MLP +from ._src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52 +from ._src.independent_outputs import IndependentOutputs, Hierarchical +from ._src.coregionalize import Coregionalize +from ._src.ODE_UY import ODE_UY +from ._src.ODE_UYC import ODE_UYC +from ._src.ODE_st import ODE_st +from ._src.ODE_t import ODE_t +from ._src.poly import Poly +from ._src.eq_ode2 import EQ_ODE2 +from ._src.trunclinear import TruncLinear,TruncLinear_inf +from ._src.splitKern import SplitKern,DEtime +from ._src.splitKern import DEtime as DiffGenomeKern -from _src.trunclinear import TruncLinear,TruncLinear_inf -from _src.splitKern import SplitKern,DEtime -from _src.splitKern import DEtime as DiffGenomeKern diff --git a/GPy/kern/_src/ODE_UY.py b/GPy/kern/_src/ODE_UY.py index b4a2b42d..9c9b47be 100644 --- a/GPy/kern/_src/ODE_UY.py +++ b/GPy/kern/_src/ODE_UY.py @@ -1,11 +1,11 @@ # Copyright (c) 2013, GPy authors (see AUTHORS.txt). # Licensed under the BSD 3-clause license (see LICENSE.txt) -from kern import Kern +from .kern import Kern from ...core.parameterization import Param from ...core.parameterization.transformations import Logexp import numpy as np -from independent_outputs import index_to_slices +from .independent_outputs import index_to_slices class ODE_UY(Kern): def __init__(self, input_dim, variance_U=3., variance_Y=1., lengthscale_U=1., lengthscale_Y=1., active_dims=None, name='ode_uy'): @@ -114,7 +114,7 @@ class ODE_UY(Kern): elif i==1: Kdiag[s1]+= Vu*Vy*(k1+k2+k3) else: - raise ValueError, "invalid input/output index" + raise ValueError("invalid input/output index") #Kdiag[slices[0][0]]+= self.variance_U #matern32 diag #Kdiag[slices[1][0]]+= self.variance_U*self.variance_Y*(k1+k2+k3) # diag return Kdiag diff --git a/GPy/kern/_src/ODE_UYC.py b/GPy/kern/_src/ODE_UYC.py index 1722d2e1..ff75a328 100644 --- a/GPy/kern/_src/ODE_UYC.py +++ b/GPy/kern/_src/ODE_UYC.py @@ -1,11 +1,11 @@ # Copyright (c) 2013, GPy authors (see AUTHORS.txt). 
# Licensed under the BSD 3-clause license (see LICENSE.txt) -from kern import Kern +from .kern import Kern from ...core.parameterization import Param from ...core.parameterization.transformations import Logexp import numpy as np -from independent_outputs import index_to_slices +from .independent_outputs import index_to_slices class ODE_UYC(Kern): def __init__(self, input_dim, variance_U=3., variance_Y=1., lengthscale_U=1., lengthscale_Y=1., ubias =1. ,active_dims=None, name='ode_uyc'): @@ -115,7 +115,7 @@ class ODE_UYC(Kern): elif i==1: Kdiag[s1]+= Vu*Vy*(k1+k2+k3) else: - raise ValueError, "invalid input/output index" + raise ValueError("invalid input/output index") #Kdiag[slices[0][0]]+= self.variance_U #matern32 diag #Kdiag[slices[1][0]]+= self.variance_U*self.variance_Y*(k1+k2+k3) # diag return Kdiag diff --git a/GPy/kern/_src/ODE_st.py b/GPy/kern/_src/ODE_st.py index 665be230..afa46d09 100644 --- a/GPy/kern/_src/ODE_st.py +++ b/GPy/kern/_src/ODE_st.py @@ -1,10 +1,10 @@ # Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Licensed under the BSD 3-clause license (see LICENSE.txt) -from kern import Kern +from .kern import Kern from ...core.parameterization import Param from ...core.parameterization.transformations import Logexp import numpy as np -from independent_outputs import index_to_slices +from .independent_outputs import index_to_slices class ODE_st(Kern): @@ -135,7 +135,7 @@ class ODE_st(Kern): Kdiag[s1]+= b**2*k1 - 2*a*c*k2 + a**2*k3 + c**2*vyt*vyx #Kdiag[s1]+= Vu*Vy*(k1+k2+k3) else: - raise ValueError, "invalid input/output index" + raise ValueError("invalid input/output index") return Kdiag diff --git a/GPy/kern/_src/ODE_t.py b/GPy/kern/_src/ODE_t.py index a470cbec..80625f51 100644 --- a/GPy/kern/_src/ODE_t.py +++ b/GPy/kern/_src/ODE_t.py @@ -1,8 +1,8 @@ -from kern import Kern +from .kern import Kern from ...core.parameterization import Param from ...core.parameterization.transformations import Logexp import numpy as np -from independent_outputs import index_to_slices +from .independent_outputs import index_to_slices class ODE_t(Kern): @@ -85,7 +85,7 @@ class ODE_t(Kern): Kdiag[s1]+= k1 + vyt+self.ubias #Kdiag[s1]+= Vu*Vy*(k1+k2+k3) else: - raise ValueError, "invalid input/output index" + raise ValueError("invalid input/output index") return Kdiag diff --git a/GPy/kern/_src/add.py b/GPy/kern/_src/add.py index 8059f68f..696a8b04 100644 --- a/GPy/kern/_src/add.py +++ b/GPy/kern/_src/add.py @@ -4,7 +4,8 @@ import numpy as np import itertools from ...util.caching import Cache_this -from kern import CombinationKernel +from .kern import CombinationKernel +from functools import reduce class Add(CombinationKernel): """ @@ -84,10 +85,10 @@ class Add(CombinationKernel): psi2 = reduce(np.add, (p.psi2(Z, variational_posterior) for p in self.parts)) #return psi2 # compute the "cross" terms - from static import White, Bias - from rbf import RBF + from .static import White, Bias + from .rbf import RBF #from rbf_inv import RBFInv - from linear import Linear + from .linear import Linear #ffrom fixed import Fixed for p1, p2 in itertools.combinations(self.parts, 2): @@ -111,11 +112,11 @@ class Add(CombinationKernel): psi2 += np.einsum('nm,no->mo',tmp1,tmp2)+np.einsum('nm,no->mo',tmp2,tmp1) #(tmp1[:, :, None] * tmp2[:, None, :]) + (tmp2[:, :, None] * tmp1[:, None, :]) else: - raise NotImplementedError, "psi2 cannot be computed for this kernel" + raise NotImplementedError("psi2 cannot be computed for this kernel") return psi2 def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, 
Z, variational_posterior): - from static import White, Bias + from .static import White, Bias for p1 in self.parts: #compute the effective dL_dpsi1. Extra terms appear becaue of the cross terms in psi2! eff_dL_dpsi1 = dL_dpsi1.copy() @@ -131,7 +132,7 @@ class Add(CombinationKernel): p1.update_gradients_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior) def gradients_Z_expectations(self, dL_psi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): - from static import White, Bias + from .static import White, Bias target = np.zeros(Z.shape) for p1 in self.parts: #compute the effective dL_dpsi1. extra terms appear becaue of the cross terms in psi2! @@ -149,7 +150,7 @@ class Add(CombinationKernel): return target def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): - from static import White, Bias + from .static import White, Bias target_grads = [np.zeros(v.shape) for v in variational_posterior.parameters] for p1 in self.parameters: #compute the effective dL_dpsi1. extra terms appear becaue of the cross terms in psi2! @@ -164,7 +165,7 @@ class Add(CombinationKernel): else: eff_dL_dpsi1 += dL_dpsi2.sum(0) * p2.psi1(Z, variational_posterior) * 2. grads = p1.gradients_qX_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior) - [np.add(target_grads[i],grads[i],target_grads[i]) for i in xrange(len(grads))] + [np.add(target_grads[i],grads[i],target_grads[i]) for i in range(len(grads))] return target_grads def add(self, other): diff --git a/GPy/kern/_src/brownian.py b/GPy/kern/_src/brownian.py index fd79973c..d403fce7 100644 --- a/GPy/kern/_src/brownian.py +++ b/GPy/kern/_src/brownian.py @@ -1,7 +1,7 @@ # Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Licensed under the BSD 3-clause license (see LICENSE.txt) -from kern import Kern +from .kern import Kern from ...core.parameterization import Param from ...core.parameterization.transformations import Logexp import numpy as np diff --git a/GPy/kern/_src/coregionalize.py b/GPy/kern/_src/coregionalize.py index 291402ec..5b91de1c 100644 --- a/GPy/kern/_src/coregionalize.py +++ b/GPy/kern/_src/coregionalize.py @@ -1,13 +1,17 @@ # Copyright (c) 2012, James Hensman and Ricardo Andrade # Licensed under the BSD 3-clause license (see LICENSE.txt) -from kern import Kern +from .kern import Kern import numpy as np -from scipy import weave from ...core.parameterization import Param from ...core.parameterization.transformations import Logexp from ...util.config import config # for assesing whether to use weave +try: + from scipy import weave +except ImportError: + config.set('weave', 'working', 'False') + class Coregionalize(Kern): """ Covariance function for intrinsic/linear coregionalization models @@ -61,7 +65,7 @@ class Coregionalize(Kern): try: return self._K_weave(X, X2) except: - print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n" + print("\n Weave compilation failed. Falling back to (slower) numpy implementation\n") config.set('weave', 'working', 'False') return self._K_numpy(X, X2) else: @@ -123,7 +127,7 @@ class Coregionalize(Kern): try: dL_dK_small = self._gradient_reduce_weave(dL_dK, index, index2) except: - print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n" + print("\n Weave compilation failed. 
Falling back to (slower) numpy implementation\n") config.set('weave', 'working', 'False') dL_dK_small = self._gradient_reduce_weave(dL_dK, index, index2) else: @@ -162,7 +166,7 @@ class Coregionalize(Kern): def update_gradients_diag(self, dL_dKdiag, X): index = np.asarray(X, dtype=np.int).flatten() - dL_dKdiag_small = np.array([dL_dKdiag[index==i].sum() for i in xrange(self.output_dim)]) + dL_dKdiag_small = np.array([dL_dKdiag[index==i].sum() for i in range(self.output_dim)]) self.W.gradient = 2.*self.W*dL_dKdiag_small[:, None] self.kappa.gradient = dL_dKdiag_small diff --git a/GPy/kern/_src/eq_ode2.py b/GPy/kern/_src/eq_ode2.py index 59f67b8b..2d42a3e6 100644 --- a/GPy/kern/_src/eq_ode2.py +++ b/GPy/kern/_src/eq_ode2.py @@ -3,7 +3,7 @@ import numpy as np from scipy.special import wofz -from kern import Kern +from .kern import Kern from ...core.parameterization import Param from ...core.parameterization.transformations import Logexp from ...util.caching import Cache_this diff --git a/GPy/kern/_src/independent_outputs.py b/GPy/kern/_src/independent_outputs.py index 21958267..aa9dca80 100644 --- a/GPy/kern/_src/independent_outputs.py +++ b/GPy/kern/_src/independent_outputs.py @@ -2,7 +2,7 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) -from kern import Kern, CombinationKernel +from .kern import Kern, CombinationKernel import numpy as np import itertools @@ -94,8 +94,10 @@ class IndependentOutputs(CombinationKernel): else: slices2 = index_to_slices(X2[:,self.index_dim]) [[[collate_grads(kern, i, dL_dK[s,s2],X[s],X2[s2]) for s in slices_i] for s2 in slices_j] for i,(kern,slices_i,slices_j) in enumerate(zip(kerns,slices,slices2))] - if self.single_kern: kern.gradient = target - else:[kern.gradient.__setitem__(Ellipsis, target[i]) for i, [kern, _] in enumerate(zip(kerns, slices))] + if self.single_kern: + self.kern.gradient = target + else: + [kern.gradient.__setitem__(Ellipsis, target[i]) for i, [kern, _] in enumerate(zip(kerns, slices))] def gradients_X(self,dL_dK, X, X2=None): target = np.zeros(X.shape) @@ -142,7 +144,7 @@ class IndependentOutputs(CombinationKernel): if self.single_kern: target[:] += kern.gradient else: target[i][:] += kern.gradient [[collate_grads(kern, i, dL_dKdiag[s], X[s,:]) for s in slices_i] for i, (kern, slices_i) in enumerate(zip(kerns, slices))] - if self.single_kern: kern.gradient = target + if self.single_kern: self.kern.gradient = target else:[kern.gradient.__setitem__(Ellipsis, target[i]) for i, [kern, _] in enumerate(zip(kerns, slices))] class Hierarchical(CombinationKernel): diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index 57b2bff5..e63ddad4 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -4,17 +4,20 @@ import sys import numpy as np from ...core.parameterization.parameterized import Parameterized -from kernel_slice_operations import KernCallsViaSlicerMeta +from .kernel_slice_operations import KernCallsViaSlicerMeta from ...util.caching import Cache_this from GPy.core.parameterization.observable_array import ObsAr +from functools import reduce +import six - - +@six.add_metaclass(KernCallsViaSlicerMeta) class Kern(Parameterized): #=========================================================================== # This adds input slice support. 
The rather ugly code for slicing can be
# found in kernel_slice_operations
- __metaclass__ = KernCallsViaSlicerMeta
+ # __metaclass__ is ignored in Python 3 - it needs to be put in the class definition
+ #__metaclass__ = KernCallsViaSlicerMeta
+ #Here, we use the Python module six to support Py3 and Py2 simultaneously
#===========================================================================
_support_GPU=False
def __init__(self, input_dim, active_dims, name, useGPU=False, *a, **kw):
@@ -178,7 +181,7 @@ class Kern(Parameterized):
"""
assert isinstance(other, Kern), "only kernels can be added to kernels..."
- from add import Add
+ from .add import Add
return Add([self, other], name=name)
def __mul__(self, other):
@@ -210,7 +213,7 @@
"""
assert isinstance(other, Kern), "only kernels can be multiplied to kernels..."
- from prod import Prod
+ from .prod import Prod
#kernels = []
#if isinstance(self, Prod): kernels.extend(self.parameters)
#else: kernels.append(self)
diff --git a/GPy/kern/_src/linear.py b/GPy/kern/_src/linear.py
index 9d1a956b..e3a45c67 100644
--- a/GPy/kern/_src/linear.py
+++ b/GPy/kern/_src/linear.py
@@ -3,7 +3,7 @@
import numpy as np
-from kern import Kern
+from .kern import Kern
from ...util.linalg import tdot
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
diff --git a/GPy/kern/_src/mlp.py b/GPy/kern/_src/mlp.py
index 16e84363..4488ea82 100644
--- a/GPy/kern/_src/mlp.py
+++ b/GPy/kern/_src/mlp.py
@@ -1,7 +1,7 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
-from kern import Kern
+from .kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
import numpy as np
diff --git a/GPy/kern/_src/periodic.py b/GPy/kern/_src/periodic.py
index e8e16506..23818007 100644
--- a/GPy/kern/_src/periodic.py
+++ b/GPy/kern/_src/periodic.py
@@ -3,11 +3,12 @@
import numpy as np
-from kern import Kern
+from .kern import Kern
from ...util.linalg import mdot
from ...util.decorators import silence_errors
from ...core.parameterization.param import Param
from ...core.parameterization.transformations import Logexp
+from functools import reduce
class Periodic(Kern):
def __init__(self, input_dim, variance, lengthscale, period, n_freq, lower, upper, active_dims, name):
diff --git a/GPy/kern/_src/poly.py b/GPy/kern/_src/poly.py
index b90e8f8f..a5306c2a 100644
--- a/GPy/kern/_src/poly.py
+++ b/GPy/kern/_src/poly.py
@@ -2,7 +2,7 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
-from kern import Kern
+from .kern import Kern
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
class Poly(Kern):
diff --git a/GPy/kern/_src/prod.py b/GPy/kern/_src/prod.py
index 63b23f45..ff7cf140 100644
--- a/GPy/kern/_src/prod.py
+++ b/GPy/kern/_src/prod.py
@@ -2,9 +2,10 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
-from kern import CombinationKernel
+from .kern import CombinationKernel
from ...util.caching import Cache_this
import itertools
+from functools import reduce
def numpy_invalid_op_as_exception(func):
diff --git a/GPy/kern/_src/psi_comp/__init__.py b/GPy/kern/_src/psi_comp/__init__.py
index a277ff02..5041da50 100644
--- a/GPy/kern/_src/psi_comp/__init__.py
+++ b/GPy/kern/_src/psi_comp/__init__.py
@@ -4,10 +4,10 @@
from ....core.parameterization.parameter_core import Pickleable
from GPy.util.caching import Cache_this
from ....core.parameterization import variational
-import rbf_psi_comp
-import ssrbf_psi_comp
-import sslinear_psi_comp
-import linear_psi_comp
+from . import rbf_psi_comp
+from . import ssrbf_psi_comp
+from . import sslinear_psi_comp
+from . import linear_psi_comp
class PSICOMP_RBF(Pickleable):
@Cache_this(limit=2, ignore_args=(0,))
@@ -17,7 +17,7 @@ class PSICOMP_RBF(Pickleable):
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
return ssrbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior)
else:
- raise ValueError, "unknown distriubtion received for psi-statistics"
+ raise ValueError("unknown distribution received for psi-statistics")
@Cache_this(limit=2, ignore_args=(0,1,2,3))
def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
@@ -26,7 +26,7 @@ class PSICOMP_RBF(Pickleable):
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
return ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior)
else:
- raise ValueError, "unknown distriubtion received for psi-statistics"
+ raise ValueError("unknown distribution received for psi-statistics")
def _setup_observers(self):
pass
@@ -40,7 +40,7 @@ class PSICOMP_Linear(Pickleable):
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
return sslinear_psi_comp.psicomputations(variance, Z, variational_posterior)
else:
- raise ValueError, "unknown distriubtion received for psi-statistics"
+ raise ValueError("unknown distribution received for psi-statistics")
@Cache_this(limit=2, ignore_args=(0,1,2,3))
def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior):
@@ -49,7 +49,7 @@
elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
return sslinear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior)
else:
- raise ValueError, "unknown distriubtion received for psi-statistics"
+ raise ValueError("unknown distribution received for psi-statistics")
def _setup_observers(self):
pass
\ No newline at end of file
diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py
index 0c6a4aef..c6998370 100644
--- a/GPy/kern/_src/rbf.py
+++ b/GPy/kern/_src/rbf.py
@@ -3,9 +3,9 @@
import numpy as np
-from stationary import Stationary
-from psi_comp import PSICOMP_RBF
-from psi_comp.rbf_psi_gpucomp import PSICOMP_RBF_GPU
+from .stationary import Stationary
+from .psi_comp import PSICOMP_RBF
+from .psi_comp.rbf_psi_gpucomp import PSICOMP_RBF_GPU
from ...util.config import *
class RBF(Stationary):
diff --git a/GPy/kern/_src/splitKern.py b/GPy/kern/_src/splitKern.py
index 3b2e5716..c131dcd8 100644
--- a/GPy/kern/_src/splitKern.py
+++ b/GPy/kern/_src/splitKern.py
@@ -3,7 +3,7 @@
A new kernel
"""
import numpy as np
-from kern import Kern,CombinationKernel
+from .kern import Kern,CombinationKernel
from .independent_outputs import index_to_slices
import itertools
@@ -104,7 +104,7 @@ class SplitKern(CombinationKernel):
assert len(slices2)<=2, 'The Split kernel only support two different indices'
target = np.zeros((X.shape[0], X2.shape[0]))
# diagonal blocks
- [[target.__setitem__((s,s2), self.kern.K(X[s,:],X2[s2,:])) for s,s2 in itertools.product(slices[i], slices2[i])] for i in xrange(min(len(slices),len(slices2)))]
+ [[target.__setitem__((s,s2), self.kern.K(X[s,:],X2[s2,:])) for s,s2 in
itertools.product(slices[i], slices2[i])] for i in range(min(len(slices),len(slices2)))] if len(slices)>1: [target.__setitem__((s,s2), self.kern_cross.K(X[s,:],X2[s2,:])) for s,s2 in itertools.product(slices[1], slices2[0])] if len(slices2)>1: @@ -135,7 +135,7 @@ class SplitKern(CombinationKernel): else: assert dL_dK.shape==(X.shape[0],X2.shape[0]) slices2 = index_to_slices(X2[:,self.index_dim]) - [[collate_grads(dL_dK[s,s2],X[s],X2[s2]) for s,s2 in itertools.product(slices[i], slices2[i])] for i in xrange(min(len(slices),len(slices2)))] + [[collate_grads(dL_dK[s,s2],X[s],X2[s2]) for s,s2 in itertools.product(slices[i], slices2[i])] for i in range(min(len(slices),len(slices2)))] if len(slices)>1: [collate_grads(dL_dK[s,s2], X[s], X2[s2], True) for s,s2 in itertools.product(slices[1], slices2[0])] if len(slices2)>1: diff --git a/GPy/kern/_src/static.py b/GPy/kern/_src/static.py index 7f59f5df..6437c6e5 100644 --- a/GPy/kern/_src/static.py +++ b/GPy/kern/_src/static.py @@ -2,7 +2,7 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) -from kern import Kern +from .kern import Kern import numpy as np from ...core.parameterization import Param from ...core.parameterization.transformations import Logexp diff --git a/GPy/kern/_src/stationary.py b/GPy/kern/_src/stationary.py index 5fa846d5..6bc4b304 100644 --- a/GPy/kern/_src/stationary.py +++ b/GPy/kern/_src/stationary.py @@ -2,16 +2,21 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) -from kern import Kern +from .kern import Kern from ...core.parameterization import Param from ...core.parameterization.transformations import Logexp from ...util.linalg import tdot from ... import util import numpy as np -from scipy import integrate, weave +from scipy import integrate from ...util.config import config # for assesing whether to use weave from ...util.caching import Cache_this +try: + from scipy import weave +except ImportError: + config.set('weave', 'working', 'False') + class Stationary(Kern): """ Stationary kernels (covariance functions). @@ -65,10 +70,10 @@ class Stationary(Kern): self.link_parameters(self.variance, self.lengthscale) def K_of_r(self, r): - raise NotImplementedError, "implement the covariance function as a fn of r to use this class" + raise NotImplementedError("implement the covariance function as a fn of r to use this class") def dK_dr(self, r): - raise NotImplementedError, "implement derivative of the covariance function wrt r to use this class" + raise NotImplementedError("implement derivative of the covariance function wrt r to use this class") @Cache_this(limit=5, ignore_args=()) def K(self, X, X2=None): @@ -165,11 +170,11 @@ class Stationary(Kern): try: self.lengthscale.gradient = self.weave_lengthscale_grads(tmp, X, X2) except: - print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n" + print("\n Weave compilation failed. 
Falling back to (slower) numpy implementation\n") config.set('weave', 'working', 'False') - self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in xrange(self.input_dim)]) + self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in range(self.input_dim)]) else: - self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in xrange(self.input_dim)]) + self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in range(self.input_dim)]) else: r = self._scaled_dist(X, X2) self.lengthscale.gradient = -np.sum(dL_dr*r)/self.lengthscale @@ -214,7 +219,7 @@ class Stationary(Kern): try: return self.gradients_X_weave(dL_dK, X, X2) except: - print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n" + print("\n Weave compilation failed. Falling back to (slower) numpy implementation\n") config.set('weave', 'working', 'False') return self.gradients_X_(dL_dK, X, X2) else: @@ -234,7 +239,7 @@ class Stationary(Kern): #the lower memory way with a loop ret = np.empty(X.shape, dtype=np.float64) - for q in xrange(self.input_dim): + for q in range(self.input_dim): np.sum(tmp*(X[:,q][:,None]-X2[:,q][None,:]), axis=1, out=ret[:,q]) ret /= self.lengthscale**2 diff --git a/GPy/kern/_src/symbolic.py b/GPy/kern/_src/symbolic.py index 006af9dc..c339893a 100644 --- a/GPy/kern/_src/symbolic.py +++ b/GPy/kern/_src/symbolic.py @@ -1,7 +1,7 @@ # Check Matthew Rocklin's blog post. import sympy as sym import numpy as np -from kern import Kern +from .kern import Kern from ...core.symbolic import Symbolic_core @@ -11,7 +11,7 @@ class Symbolic(Kern, Symbolic_core): def __init__(self, input_dim, k=None, output_dim=1, name='symbolic', parameters=None, active_dims=None, operators=None, func_modules=[]): if k is None: - raise ValueError, "You must provide an argument for the covariance function." 
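Every raise rewritten in this patch follows the same rule: the Python 2 comma form (raise ValueError, "message") is a SyntaxError under Python 3, while the call form is accepted by both interpreters. A minimal sketch, with an illustrative function and message rather than GPy code:

    # Python 2 only, rejected by the Python 3 parser:
    #     raise ValueError, "expected a square matrix"
    # Portable call form, as used throughout this patch:
    def require_square(shape):
        if shape[0] != shape[1]:
            raise ValueError("expected a square matrix, got %s x %s" % shape)
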
+ raise ValueError("You must provide an argument for the covariance function.") Kern.__init__(self, input_dim, active_dims, name=name) kdiag = k diff --git a/GPy/kern/_src/trunclinear.py b/GPy/kern/_src/trunclinear.py index 4ebd51b6..8c48f134 100644 --- a/GPy/kern/_src/trunclinear.py +++ b/GPy/kern/_src/trunclinear.py @@ -3,7 +3,7 @@ import numpy as np -from kern import Kern +from .kern import Kern from ...core.parameterization import Param from ...core.parameterization.transformations import Logexp from ...util.caching import Cache_this diff --git a/GPy/likelihoods/__init__.py b/GPy/likelihoods/__init__.py index c1064e92..3157bd5b 100644 --- a/GPy/likelihoods/__init__.py +++ b/GPy/likelihoods/__init__.py @@ -1,9 +1,10 @@ -from bernoulli import Bernoulli -from exponential import Exponential -from gaussian import Gaussian -from gamma import Gamma -from poisson import Poisson -from student_t import StudentT -from likelihood import Likelihood -from mixed_noise import MixedNoise -from binomial import Binomial +from .bernoulli import Bernoulli +from .exponential import Exponential +from .gaussian import Gaussian +from .gamma import Gamma +from .poisson import Poisson +from .student_t import StudentT +from .likelihood import Likelihood +from .mixed_noise import MixedNoise +from .binomial import Binomial + diff --git a/GPy/likelihoods/bernoulli.py b/GPy/likelihoods/bernoulli.py index 2febda96..e540f016 100644 --- a/GPy/likelihoods/bernoulli.py +++ b/GPy/likelihoods/bernoulli.py @@ -2,10 +2,9 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np -from ..util.univariate_Gaussian import std_norm_cdf, std_norm_pdf - -import link_functions -from likelihood import Likelihood +from ..util.univariate_Gaussian import std_norm_pdf, std_norm_cdf +from . import link_functions +from .likelihood import Likelihood class Bernoulli(Likelihood): """ diff --git a/GPy/likelihoods/binomial.py b/GPy/likelihoods/binomial.py index 4accaa44..22009968 100644 --- a/GPy/likelihoods/binomial.py +++ b/GPy/likelihoods/binomial.py @@ -3,8 +3,8 @@ import numpy as np from ..util.univariate_Gaussian import std_norm_pdf, std_norm_cdf -import link_functions -from likelihood import Likelihood +from . import link_functions +from .likelihood import Likelihood from scipy import special class Binomial(Likelihood): diff --git a/GPy/likelihoods/exponential.py b/GPy/likelihoods/exponential.py index eca6ce52..0a6c543d 100644 --- a/GPy/likelihoods/exponential.py +++ b/GPy/likelihoods/exponential.py @@ -5,8 +5,8 @@ import numpy as np from scipy import stats,special import scipy as sp -import link_functions -from likelihood import Likelihood +from . import link_functions +from .likelihood import Likelihood class Exponential(Likelihood): """ diff --git a/GPy/likelihoods/gamma.py b/GPy/likelihoods/gamma.py index 9d742d02..79aba4a5 100644 --- a/GPy/likelihoods/gamma.py +++ b/GPy/likelihoods/gamma.py @@ -6,8 +6,8 @@ import numpy as np from scipy import stats,special import scipy as sp from ..core.parameterization import Param -import link_functions -from likelihood import Likelihood +from . import link_functions +from .likelihood import Likelihood class Gamma(Likelihood): """ diff --git a/GPy/likelihoods/gaussian.py b/GPy/likelihoods/gaussian.py index f46339d1..9ecf7dbf 100644 --- a/GPy/likelihoods/gaussian.py +++ b/GPy/likelihoods/gaussian.py @@ -13,8 +13,8 @@ James 11/12/13 import numpy as np from scipy import stats, special -import link_functions -from likelihood import Likelihood +from . 
import link_functions +from .likelihood import Likelihood from ..core.parameterization import Param from ..core.parameterization.transformations import Logexp from scipy import stats @@ -35,8 +35,8 @@ class Gaussian(Likelihood): gp_link = link_functions.Identity() if not isinstance(gp_link, link_functions.Identity): - print "Warning, Exact inference is not implemeted for non-identity link functions,\ - if you are not already, ensure Laplace inference_method is used" + print("Warning, Exact inference is not implemented for non-identity link functions,\ + if you are not already, ensure Laplace inference_method is used") super(Gaussian, self).__init__(gp_link, name=name) diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py index 9f2f3e7a..c876100d 100644 --- a/GPy/likelihoods/likelihood.py +++ b/GPy/likelihoods/likelihood.py @@ -4,7 +4,7 @@ import numpy as np from scipy import stats,special import scipy as sp -import link_functions +from . import link_functions from ..util.misc import chain_1, chain_2, chain_3, blockify_dhess_dtheta, blockify_third, blockify_hessian, safe_exp from scipy.integrate import quad import warnings @@ -254,8 +254,8 @@ class Likelihood(Parameterized): return mean def _conditional_mean(self, f): - """Quadrature calculation of the conditional mean: E(Y_star|f_star)""" - raise NotImplementedError, "implement this function to make predictions" + """Quadrature calculation of the conditional mean: E(Y_star|f)""" + raise NotImplementedError("implement this function to make predictions") def predictive_variance(self, mu,variance, predictive_mean=None, Y_metadata=None): """ @@ -604,7 +604,7 @@ class Likelihood(Parameterized): :param burnin: number of samples to use for burnin (will need modifying) :param Y_metadata: Y_metadata for pdf """ - print "Warning, using MCMC for sampling y*, needs to be tuned!" + print("Warning, using MCMC for sampling y*, needs to be tuned!") if starting_loc is None: starting_loc = fNew from functools import partial @@ -658,8 +658,8 @@ class Likelihood(Parameterized): #Show progress if i % int((burn_in+num_samples)*0.1) == 0: - print "{}% of samples taken ({})".format((i/int((burn_in+num_samples)*0.1)*10), i) - print "Last run accept ratio: ", accept_ratio[i] + print("{}% of samples taken ({})".format((i/int((burn_in+num_samples)*0.1)*10), i)) + print("Last run accept ratio: ", accept_ratio[i]) - print "Average accept ratio: ", np.mean(accept_ratio) + print("Average accept ratio: ", np.mean(accept_ratio)) return chain_values diff --git a/GPy/likelihoods/link_functions.py b/GPy/likelihoods/link_functions.py index 6b297f92..41bff1f8 100644 --- a/GPy/likelihoods/link_functions.py +++ b/GPy/likelihoods/link_functions.py @@ -177,7 +177,7 @@ class Heaviside(GPTransformation): return np.where(f>0, 1, 0) def dtransf_df(self,f): - raise NotImplementedError, "This function is not differentiable!" + raise NotImplementedError("This function is not differentiable!") def d2transf_df2(self,f): - raise NotImplementedError, "This function is not differentiable!" + raise NotImplementedError("This function is not differentiable!") diff --git a/GPy/likelihoods/mixed_noise.py b/GPy/likelihoods/mixed_noise.py index 8c56f45b..84b3001d 100644 --- a/GPy/likelihoods/mixed_noise.py +++ b/GPy/likelihoods/mixed_noise.py @@ -3,9 +3,9 @@ import numpy as np from scipy import stats, special -import link_functions -from likelihood import Likelihood -from gaussian import Gaussian +from . 
import link_functions +from .likelihood import Likelihood +from .gaussian import Gaussian from ..core.parameterization import Param from ..core.parameterization.transformations import Logexp from ..core.parameterization import Parameterized diff --git a/GPy/likelihoods/poisson.py b/GPy/likelihoods/poisson.py index 6da3160f..5aa85a91 100644 --- a/GPy/likelihoods/poisson.py +++ b/GPy/likelihoods/poisson.py @@ -5,8 +5,8 @@ from __future__ import division import numpy as np from scipy import stats,special import scipy as sp -import link_functions -from likelihood import Likelihood +from . import link_functions +from .likelihood import Likelihood class Poisson(Likelihood): """ diff --git a/GPy/likelihoods/student_t.py b/GPy/likelihoods/student_t.py index 674972bf..b66d4c0f 100644 --- a/GPy/likelihoods/student_t.py +++ b/GPy/likelihoods/student_t.py @@ -4,10 +4,10 @@ import numpy as np from scipy import stats, special import scipy as sp -import link_functions +from . import link_functions from scipy import stats, integrate from scipy.special import gammaln, gamma -from likelihood import Likelihood +from .likelihood import Likelihood from ..core.parameterization import Param from ..core.parameterization.transformations import Logexp diff --git a/GPy/mappings/__init__.py b/GPy/mappings/__init__.py index b1cb194b..5193a232 100644 --- a/GPy/mappings/__init__.py +++ b/GPy/mappings/__init__.py @@ -1,8 +1,9 @@ # Copyright (c) 2013, 2014 GPy authors (see AUTHORS.txt). # Licensed under the BSD 3-clause license (see LICENSE.txt) -from kernel import Kernel -from linear import Linear -from mlp import MLP -from additive import Additive -from compound import Compound +from .kernel import Kernel +from .linear import Linear +from .mlp import MLP +from .additive import Additive +from .compound import Compound + diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py index c6abb5de..8f8fd838 100644 --- a/GPy/models/__init__.py +++ b/GPy/models/__init__.py @@ -1,23 +1,23 @@ # Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
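The models/__init__.py hunk that follows is the same fix applied to every package above: Python 2 resolved a bare import gp_regression against the containing package first (an implicit relative import), whereas Python 3 searches sys.path only. A schematic sketch using a hypothetical package pkg, not a GPy module:

    # pkg/__init__.py (hypothetical layout, for illustration only)
    from __future__ import absolute_import  # make Python 2 resolve imports like Python 3
    # Python 2 implicit relative import, an ImportError on Python 3:
    #     from helpers import tool
    # Explicit relative import, valid on Python 2.6+ and Python 3:
    from .helpers import tool
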
# Licensed under the BSD 3-clause license (see LICENSE.txt) -from gp_regression import GPRegression -from gp_classification import GPClassification -from sparse_gp_regression import SparseGPRegression, SparseGPRegressionUncertainInput -from sparse_gp_classification import SparseGPClassification -from gplvm import GPLVM -from bcgplvm import BCGPLVM -from sparse_gplvm import SparseGPLVM -from warped_gp import WarpedGP -from bayesian_gplvm import BayesianGPLVM -from mrd import MRD -from gradient_checker import GradientChecker -from ss_gplvm import SSGPLVM -from gp_coregionalized_regression import GPCoregionalizedRegression -from sparse_gp_coregionalized_regression import SparseGPCoregionalizedRegression -from gp_heteroscedastic_regression import GPHeteroscedasticRegression -from ss_mrd import SSMRD -from gp_kronecker_gaussian_regression import GPKroneckerGaussianRegression -from gp_var_gauss import GPVariationalGaussianApproximation -from one_vs_all_classification import OneVsAllClassification -from one_vs_all_sparse_classification import OneVsAllSparseClassification +from .gp_regression import GPRegression +from .gp_classification import GPClassification +from .sparse_gp_regression import SparseGPRegression, SparseGPRegressionUncertainInput +from .sparse_gp_classification import SparseGPClassification +from .gplvm import GPLVM +from .bcgplvm import BCGPLVM +from .sparse_gplvm import SparseGPLVM +from .warped_gp import WarpedGP +from .bayesian_gplvm import BayesianGPLVM +from .mrd import MRD +from .gradient_checker import GradientChecker +from .ss_gplvm import SSGPLVM +from .gp_coregionalized_regression import GPCoregionalizedRegression +from .sparse_gp_coregionalized_regression import SparseGPCoregionalizedRegression +from .gp_heteroscedastic_regression import GPHeteroscedasticRegression +from .ss_mrd import SSMRD +from .gp_kronecker_gaussian_regression import GPKroneckerGaussianRegression +from .gp_var_gauss import GPVariationalGaussianApproximation +from .one_vs_all_classification import OneVsAllClassification +from .one_vs_all_sparse_classification import OneVsAllSparseClassification diff --git a/GPy/models/mrd.py b/GPy/models/mrd.py index f3e643c9..be01b769 100644 --- a/GPy/models/mrd.py +++ b/GPy/models/mrd.py @@ -74,6 +74,8 @@ class MRD(BayesianGPLVMMiniBatch): self.logger.debug("creating observable arrays") self.Ylist = [ObsAr(Y) for Y in Ylist] + #The next line is a fix for Python 3. 
It replicates the python 2 behaviour from the above comprehension + Y = Ylist[-1] if Ynames is None: self.logger.debug("creating Ynames") @@ -82,7 +84,7 @@ class MRD(BayesianGPLVMMiniBatch): assert len(self.names) == len(self.Ylist), "one name per dataset, or None if Ylist is a dict" if inference_method is None: - self.inference_method = InferenceMethodList([VarDTC() for _ in xrange(len(self.Ylist))]) + self.inference_method = InferenceMethodList([VarDTC() for _ in range(len(self.Ylist))]) else: assert isinstance(inference_method, InferenceMethodList), "please provide one inference method per Y in the list and provide it as InferenceMethodList, inference_method given: {}".format(inference_method) self.inference_method = inference_method @@ -137,7 +139,7 @@ class MRD(BayesianGPLVMMiniBatch): self.bgplvms = [] - for i, n, k, l, Y, im, bs in itertools.izip(itertools.count(), Ynames, kernels, likelihoods, Ylist, self.inference_method, batchsize): + for i, n, k, l, Y, im, bs in zip(itertools.count(), Ynames, kernels, likelihoods, Ylist, self.inference_method, batchsize): assert Y.shape[0] == self.num_data, "All datasets need to share the number of datapoints, and those have to correspond to one another" md = np.isnan(Y).any() spgp = BayesianGPLVMMiniBatch(Y, input_dim, X, X_variance, @@ -164,7 +166,7 @@ class MRD(BayesianGPLVMMiniBatch): self._log_marginal_likelihood = 0 self.Z.gradient[:] = 0. self.X.gradient[:] = 0. - for b, i in itertools.izip(self.bgplvms, self.inference_method): + for b, i in zip(self.bgplvms, self.inference_method): self._log_marginal_likelihood += b._log_marginal_likelihood self.logger.info('working on im <{}>'.format(hex(id(i)))) @@ -195,7 +197,7 @@ class MRD(BayesianGPLVMMiniBatch): elif init in "PCA_single": X = np.zeros((Ylist[0].shape[0], self.input_dim)) fracs = [] - for qs, Y in itertools.izip(np.array_split(np.arange(self.input_dim), len(Ylist)), Ylist): + for qs, Y in zip(np.array_split(np.arange(self.input_dim), len(Ylist)), Ylist): x,frcs = initialize_latent('PCA', len(qs), Y) X[:, qs] = x fracs.append(frcs) @@ -327,9 +329,9 @@ class MRD(BayesianGPLVMMiniBatch): def __getstate__(self): state = super(MRD, self).__getstate__() - if state.has_key('kern'): + if 'kern' in state: del state['kern'] - if state.has_key('likelihood'): + if 'likelihood' in state: del state['likelihood'] return state @@ -338,4 +340,4 @@ class MRD(BayesianGPLVMMiniBatch): super(MRD, self).__setstate__(state) self.kern = self.bgplvms[0].kern self.likelihood = self.bgplvms[0].likelihood - self.parameters_changed() \ No newline at end of file + self.parameters_changed() diff --git a/GPy/models/one_vs_all_sparse_classification.py b/GPy/models/one_vs_all_sparse_classification.py index 3bdd2647..7528ffd2 100644 --- a/GPy/models/one_vs_all_sparse_classification.py +++ b/GPy/models/one_vs_all_sparse_classification.py @@ -30,7 +30,7 @@ class OneVsAllSparseClassification(object): self.results = {} for yj in labels: - print 'Class %s vs all' %yj + print('Class %s vs all' %yj) Ynew = Y.copy() Ynew[Y.flatten()!=yj] = 0 Ynew[Y.flatten()==yj] = 1 diff --git a/GPy/models/sparse_gp_minibatch.py b/GPy/models/sparse_gp_minibatch.py index 61fc0bb7..ad62043a 100644 --- a/GPy/models/sparse_gp_minibatch.py +++ b/GPy/models/sparse_gp_minibatch.py @@ -1,6 +1,7 @@ # Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
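The Y = Ylist[-1] line added to mrd.py just above compensates for a scoping change: Python 3 gives list comprehensions their own scope, so [ObsAr(Y) for Y in Ylist] no longer rebinds the enclosing Y to the last element the way Python 2 did, and later code relied on that leak. A standalone demonstration:

    Ylist = [1, 2, 3]
    Y = "outer"
    doubled = [Y * 2 for Y in Ylist]  # Python 2 leaves Y == 3 here; Python 3 leaves Y == "outer"
    Y = Ylist[-1]                     # restores the Python 2 behaviour explicitly
    assert Y == 3
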
# Licensed under the BSD 3-clause license (see LICENSE.txt) +from __future__ import print_function import numpy as np from ..core.parameterization.param import Param from ..core.sparse_gp import SparseGP @@ -50,8 +51,8 @@ class SparseGPMiniBatch(SparseGP): inference_method = var_dtc.VarDTC(limit=1 if not missing_data else Y.shape[1]) else: #inference_method = ?? - raise NotImplementedError, "what to do what to do?" - print "defaulting to ", inference_method, "for latent function inference" + raise NotImplementedError("what to do what to do?") + print("defaulting to ", inference_method, "for latent function inference") self.kl_factr = 1. self.Z = Param('inducing inputs', Z) @@ -81,13 +82,13 @@ class SparseGPMiniBatch(SparseGP): overall = self.Y_normalized.shape[1] m_f = lambda i: "Precomputing Y for missing data: {: >7.2%}".format(float(i+1)/overall) message = m_f(-1) - print message, - for d in xrange(overall): + print(message, end=' ') + for d in range(overall): self.Ylist.append(self.Y_normalized[self.ninan[:, d], d][:, None]) - print ' '*(len(message)+1) + '\r', + print(' '*(len(message)+1) + '\r', end=' ') message = m_f(d) - print message, - print '' + print(message, end=' ') + print('') self.posterior = None @@ -182,11 +183,11 @@ class SparseGPMiniBatch(SparseGP): full_values[key][value_indices[key]] += current_values[key] """ for key in current_values.keys(): - if value_indices is not None and value_indices.has_key(key): + if value_indices is not None and key in value_indices: index = value_indices[key] else: index = slice(None) - if full_values.has_key(key): + if key in full_values: full_values[key][index] += current_values[key] else: full_values[key] = current_values[key] @@ -242,15 +243,15 @@ class SparseGPMiniBatch(SparseGP): if not self.stochastics: m_f = lambda i: "Inference with missing_data: {: >7.2%}".format(float(i+1)/self.output_dim) message = m_f(-1) - print message, + print(message, end=' ') for d in self.stochastics.d: ninan = self.ninan[:, d] if not self.stochastics: - print ' '*(len(message)) + '\r', + print(' '*(len(message)) + '\r', end=' ') message = m_f(d) - print message, + print(message, end=' ') posterior, log_marginal_likelihood, \ grad_dict, current_values, value_indices = self._inner_parameters_changed( @@ -269,7 +270,7 @@ class SparseGPMiniBatch(SparseGP): woodbury_vector[:, d:d+1] = posterior.woodbury_vector self._log_marginal_likelihood += log_marginal_likelihood if not self.stochastics: - print '' + print('') if self.posterior is None: self.posterior = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py index 04006d84..0f3b8fdd 100644 --- a/GPy/models/ss_gplvm.py +++ b/GPy/models/ss_gplvm.py @@ -74,7 +74,7 @@ class SSGPLVM(SparseGP_MPI): self.link_parameter(self.X, index=0) if self.group_spike: - [self.X.gamma[:,i].tie('tieGamma'+str(i)) for i in xrange(self.X.gamma.shape[1])] # Tie columns together + [self.X.gamma[:,i].tie('tieGamma'+str(i)) for i in range(self.X.gamma.shape[1])] # Tie columns together def set_X_gradients(self, X, X_grad): """Set the gradients of the posterior distribution of X in its specific form.""" diff --git a/GPy/models/ss_mrd.py b/GPy/models/ss_mrd.py index 036ac095..bd2efce0 100644 --- a/GPy/models/ss_mrd.py +++ b/GPy/models/ss_mrd.py @@ -19,10 +19,10 @@ class SSMRD(Model): name='model_'+str(i)) for i,y in enumerate(Ylist)] self.add_parameters(*(self.models)) - [[[self.models[m].X.mean[i,j:j+1].tie('mean_'+str(i)+'_'+str(j)) for m in 
xrange(len(self.models))] for j in xrange(self.models[0].X.mean.shape[1])] - for i in xrange(self.models[0].X.mean.shape[0])] - [[[self.models[m].X.variance[i,j:j+1].tie('var_'+str(i)+'_'+str(j)) for m in xrange(len(self.models))] for j in xrange(self.models[0].X.variance.shape[1])] - for i in xrange(self.models[0].X.variance.shape[0])] + [[[self.models[m].X.mean[i,j:j+1].tie('mean_'+str(i)+'_'+str(j)) for m in range(len(self.models))] for j in range(self.models[0].X.mean.shape[1])] + for i in range(self.models[0].X.mean.shape[0])] + [[[self.models[m].X.variance[i,j:j+1].tie('var_'+str(i)+'_'+str(j)) for m in range(len(self.models))] for j in range(self.models[0].X.variance.shape[1])] + for i in range(self.models[0].X.variance.shape[0])] self.updates = True @@ -31,4 +31,4 @@ self._log_marginal_likelihood = sum([m._log_marginal_likelihood for m in self.models]) def log_likelihood(self): - return self._log_marginal_likelihood \ No newline at end of file + return self._log_marginal_likelihood diff --git a/GPy/plotting/__init__.py b/GPy/plotting/__init__.py index d3a96914..9dd84441 100644 --- a/GPy/plotting/__init__.py +++ b/GPy/plotting/__init__.py @@ -2,6 +2,6 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) try: - import matplot_dep + from . import matplot_dep except (ImportError, NameError): - print 'Fail to load GPy.plotting.matplot_dep.' \ No newline at end of file + print('Failed to load GPy.plotting.matplot_dep.') \ No newline at end of file diff --git a/GPy/plotting/matplot_dep/base_plots.py b/GPy/plotting/matplot_dep/base_plots.py index b4142342..f25aee49 100644 --- a/GPy/plotting/matplot_dep/base_plots.py +++ b/GPy/plotting/matplot_dep/base_plots.py @@ -133,7 +133,7 @@ def x_frame1D(X,plot_limits=None,resolution=None): elif len(plot_limits)==2: xmin, xmax = plot_limits else: - raise ValueError, "Bad limits for plotting" + raise ValueError("Bad limits for plotting") Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None] return Xnew, xmin, xmax @@ -149,7 +149,7 @@ def x_frame2D(X,plot_limits=None,resolution=None): elif len(plot_limits)==2: xmin, xmax = plot_limits else: - raise ValueError, "Bad limits for plotting" + raise ValueError("Bad limits for plotting") resolution = resolution or 50 xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution] diff --git a/GPy/plotting/matplot_dep/dim_reduction_plots.py b/GPy/plotting/matplot_dep/dim_reduction_plots.py index 1398b40c..2c243e13 100644 --- a/GPy/plotting/matplot_dep/dim_reduction_plots.py +++ b/GPy/plotting/matplot_dep/dim_reduction_plots.py @@ -27,7 +27,7 @@ def most_significant_input_dimensions(model, which_indices): try: input_1, input_2 = np.argsort(model.input_sensitivity())[::-1][:2] except: - raise ValueError, "cannot automatically determine which dimensions to plot, please pass 'which_indices'" + raise ValueError("cannot automatically determine which dimensions to plot, please pass 'which_indices'") else: input_1, input_2 = which_indices return input_1, input_2 @@ -62,7 +62,7 @@ def plot_latent(model, labels=None, which_indices=None, if X.shape[0] > 1000: - print "Warning: subsampling X, as it has more samples then 1000. X.shape={!s}".format(X.shape) + print("Warning: subsampling X, as it has more samples than 1000. 
X.shape={!s}".format(X.shape)) subsample = np.random.choice(X.shape[0], size=1000, replace=False) X = X[subsample] labels = labels[subsample] @@ -133,7 +133,7 @@ def plot_latent(model, labels=None, which_indices=None, try: xmin, xmax, ymin, ymax = plot_limits except (TypeError, ValueError) as e: - raise e.__class__, "Wrong plot limits: {} given -> need (xmin, xmax, ymin, ymax)".format(plot_limits) + raise e.__class__("Wrong plot limits: {} given -> need (xmin, xmax, ymin, ymax)".format(plot_limits)) view = ImshowController(ax, plot_function, (xmin, ymin, xmax, ymax), resolution, aspect=aspect, interpolation='bilinear', @@ -187,14 +187,14 @@ def plot_latent(model, labels=None, which_indices=None, fig.tight_layout() fig.canvas.draw() except Exception as e: - print "Could not invoke tight layout: {}".format(e) + print("Could not invoke tight layout: {}".format(e)) pass if updates: try: ax.figure.canvas.show() except Exception as e: - print "Could not invoke show: {}".format(e) + print("Could not invoke show: {}".format(e)) raw_input('Enter to continue') view.deactivate() return ax diff --git a/GPy/plotting/matplot_dep/img_plots.py b/GPy/plotting/matplot_dep/img_plots.py index 453a904d..5346545d 100644 --- a/GPy/plotting/matplot_dep/img_plots.py +++ b/GPy/plotting/matplot_dep/img_plots.py @@ -50,8 +50,8 @@ def plot_2D_images(figure, arr, symmetric=False, pad=None, zoom=None, mode=None, buf = np.ones((y_size*fig_nrows+pad*(fig_nrows-1), x_size*fig_ncols+pad*(fig_ncols-1), 3),dtype=arr.dtype) - for y in xrange(fig_nrows): - for x in xrange(fig_ncols): + for y in range(fig_nrows): + for x in range(fig_ncols): if y*fig_ncols+x 0: - print "Failing models: " - print failing_models + print("Failing models: ") + print(failing_models) if len(failing_models.keys()) > 0: - print failing_models + print(failing_models) raise Exception(failing_models) if __name__ == "__main__": - print "Running unit tests, please be (very) patient..." 
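Most hunks in this patch are the print-statement-to-function rewrite, and sparse_gp_minibatch.py above shows the one case that needs more than parentheses: a trailing comma (print message,) suppressed the newline on Python 2, which only the function form with end=' ' can express. Hence the __future__ import added to that module. A minimal sketch of the mapping:

    from __future__ import print_function  # required on Python 2 for end=/sep=
    print("progress:", end=' ')  # Python 2 spelling: print "progress:",
    print("done")                # Python 2 spelling: print "done"
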
+ print("Running unit tests, please be (very) patient...") # unittest.main() test_models() diff --git a/GPy/testing/index_operations_tests.py b/GPy/testing/index_operations_tests.py index e5c2011a..a97f1beb 100644 --- a/GPy/testing/index_operations_tests.py +++ b/GPy/testing/index_operations_tests.py @@ -121,14 +121,16 @@ class Test(unittest.TestCase): self.assertListEqual(removed.tolist(), [0, 2]) def test_misc(self): - for k,v in self.param_index.copy()._properties.iteritems(): + #py3 fix + #for k,v in self.param_index.copy()._properties.iteritems(): + for k,v in self.param_index.copy()._properties.items(): self.assertListEqual(self.param_index[k].tolist(), v.tolist()) self.assertEqual(self.param_index.size, 8) self.assertEqual(self.view.size, 5) def test_print(self): - print self.param_index - print self.view + print(self.param_index) + print(self.view) if __name__ == "__main__": #import sys;sys.argv = ['', 'Test.test_index_view'] diff --git a/GPy/testing/inference_tests.py b/GPy/testing/inference_tests.py index d5039049..e09df1fe 100644 --- a/GPy/testing/inference_tests.py +++ b/GPy/testing/inference_tests.py @@ -64,7 +64,7 @@ class InferenceXTestCase(unittest.TestCase): m.optimize(max_iters=10000) x, mi = m.infer_newX(m.Y) - print m.X.mean - mi.X.mean + print(m.X.mean - mi.X.mean) self.assertTrue(np.allclose(m.X.mean, mi.X.mean, rtol=1e-4, atol=1e-4)) self.assertTrue(np.allclose(m.X.variance, mi.X.variance, rtol=1e-4, atol=1e-4)) diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py index 458f5cd8..f3d82216 100644 --- a/GPy/testing/kernel_tests.py +++ b/GPy/testing/kernel_tests.py @@ -37,7 +37,7 @@ class Kern_check_model(GPy.core.Model): def is_positive_semi_definite(self): v = np.linalg.eig(self.kernel.K(self.X))[0] if any(v.real<=-1e-10): - print v.real.min() + print(v.real.min()) return False else: return True @@ -126,7 +126,7 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb if result and verbose: print("Check passed.") if not result: - print("Positive definite check failed for " + kern.name + " covariance function.") + print(("Positive definite check failed for " + kern.name + " covariance function.")) pass_checks = False assert(result) return False @@ -137,7 +137,7 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb if result and verbose: print("Check passed.") if not result: - print("Gradient of K(X, X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:") + print(("Gradient of K(X, X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:")) Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=True) pass_checks = False assert(result) @@ -149,7 +149,7 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb if result and verbose: print("Check passed.") if not result: - print("Gradient of K(X, X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:") + print(("Gradient of K(X, X) wrt theta failed for " + kern.name + " covariance function. 
Gradient values as follows:")) Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=True) pass_checks = False assert(result) @@ -162,11 +162,11 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb except NotImplementedError: result=True if verbose: - print("update_gradients_diag not implemented for " + kern.name) + print(("update_gradients_diag not implemented for " + kern.name)) if result and verbose: print("Check passed.") if not result: - print("Gradient of Kdiag(X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:") + print(("Gradient of Kdiag(X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:")) Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=True) pass_checks = False assert(result) @@ -182,13 +182,12 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb except NotImplementedError: result=True if verbose: - print("gradients_X not implemented for " + kern.name) + print(("gradients_X not implemented for " + kern.name)) if result and verbose: print("Check passed.") if not result: - print("Gradient of K(X, X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:") + print(("Gradient of K(X, X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:")) testmodel.checkgrad(verbose=True) - import ipdb;ipdb.set_trace() assert(result) pass_checks = False return False @@ -203,11 +202,11 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb except NotImplementedError: result=True if verbose: - print("gradients_X not implemented for " + kern.name) + print(("gradients_X not implemented for " + kern.name)) if result and verbose: print("Check passed.") if not result: - print("Gradient of K(X, X2) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:") + print(("Gradient of K(X, X2) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:")) testmodel.checkgrad(verbose=True) assert(result) pass_checks = False @@ -223,11 +222,11 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb except NotImplementedError: result=True if verbose: - print("gradients_X not implemented for " + kern.name) + print(("gradients_X not implemented for " + kern.name)) if result and verbose: print("Check passed.") if not result: - print("Gradient of Kdiag(X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:") + print(("Gradient of Kdiag(X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:")) Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=True) pass_checks = False assert(result) @@ -292,7 +291,7 @@ class KernelGradientTestsContinuous(unittest.TestCase): try: k.K(self.X) except AssertionError: - raise AssertionError, "k.K(X) should run on self.D-1 dimension" + raise AssertionError("k.K(X) should run on self.D-1 dimension") def test_Matern52(self): k = GPy.kern.Matern52(self.D) @@ -429,7 +428,7 @@ class KernelTestsProductWithZeroValues(unittest.TestCase): if __name__ == "__main__": - print "Running unit tests, please be (very) patient..." 
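A cosmetic artifact of the mechanical conversion shows in kernel_tests.py above: 2to3 wraps the whole payload of a Python 2 print statement in parentheses, turning print "a" + b into print(("a" + b)). The doubled parentheses are redundant but harmless, since the inner pair is grouping rather than a tuple. The equivalent single-paren form, with an illustrative name:

    name = "rbf"  # illustrative kernel name
    # 2to3 output, with the extra grouping parentheses:
    print(("Positive definite check failed for " + name + " covariance function."))
    # Equivalent idiomatic call:
    print("Positive definite check failed for {} covariance function.".format(name))
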
+ print("Running unit tests, please be (very) patient...") unittest.main() # np.random.seed(0) diff --git a/GPy/testing/likelihood_tests.py b/GPy/testing/likelihood_tests.py index 7b6164c1..7fa5886f 100644 --- a/GPy/testing/likelihood_tests.py +++ b/GPy/testing/likelihood_tests.py @@ -27,9 +27,9 @@ def dparam_partial(inst_func, *args): param """ def param_func(param_val, param_name, inst_func, args): - #inst_func.im_self._set_params(param) - #inst_func.im_self.add_parameter(Param(param_name, param_val)) - inst_func.im_self[param_name] = param_val + #inst_func.__self__._set_params(param) + #inst_func.__self__.add_parameter(Param(param_name, param_val)) + inst_func.__self__[param_name] = param_val return inst_func(*args) return functools.partial(param_func, inst_func=inst_func, args=args) @@ -44,8 +44,8 @@ def dparam_checkgrad(func, dfunc, params, params_names, args, constraints=None, The number of parameters and N is the number of data Need to take a slice out from f and a slice out of df """ - print "\n{} likelihood: {} vs {}".format(func.im_self.__class__.__name__, - func.__name__, dfunc.__name__) + print("\n{} likelihood: {} vs {}".format(func.__self__.__class__.__name__, + func.__name__, dfunc.__name__)) partial_f = dparam_partial(func, *args) partial_df = dparam_partial(dfunc, *args) gradchecking = True @@ -66,7 +66,7 @@ def dparam_checkgrad(func, dfunc, params, params_names, args, constraints=None, for fixed_val in range(dfnum): #dlik and dlik_dvar gives back 1 value for each f_ind = min(fnum, fixed_val+1) - 1 - print "fnum: {} dfnum: {} f_ind: {} fixed_val: {}".format(fnum, dfnum, f_ind, fixed_val) + print("fnum: {} dfnum: {} f_ind: {} fixed_val: {}".format(fnum, dfnum, f_ind, fixed_val)) #Make grad checker with this param moving, note that set_params is NOT being called #The parameter is being set directly with __setattr__ #Check only the parameter and function value we wish to check at a time @@ -83,12 +83,12 @@ def dparam_checkgrad(func, dfunc, params, params_names, args, constraints=None, if grad.grep_param_names(constrain_param): constraint(constrain_param, grad) else: - print "parameter didn't exist" - print constrain_param, " ", constraint + print("parameter didn't exist") + print(constrain_param, " ", constraint) if randomize: grad.randomize() if verbose: - print grad + print(grad) grad.checkgrad(verbose=1) if not grad.checkgrad(verbose=True): gradchecking = False @@ -297,7 +297,7 @@ class TestNoiseModels(object): def test_scale2_models(self): self.setUp() - for name, attributes in self.noise_models.iteritems(): + for name, attributes in self.noise_models.items(): model = attributes["model"] if "grad_params" in attributes: params = attributes["grad_params"] @@ -373,8 +373,8 @@ class TestNoiseModels(object): ############# @with_setup(setUp, tearDown) def t_logpdf(self, model, Y, f, Y_metadata): - print "\n{}".format(inspect.stack()[0][3]) - print model + print("\n{}".format(inspect.stack()[0][3])) + print(model) #print model._get_params() np.testing.assert_almost_equal( model.pdf(f.copy(), Y.copy(), Y_metadata=Y_metadata).prod(), @@ -383,33 +383,33 @@ class TestNoiseModels(object): @with_setup(setUp, tearDown) def t_dlogpdf_df(self, model, Y, f, Y_metadata): - print "\n{}".format(inspect.stack()[0][3]) + print("\n{}".format(inspect.stack()[0][3])) self.description = "\n{}".format(inspect.stack()[0][3]) logpdf = functools.partial(np.sum(model.logpdf), y=Y, Y_metadata=Y_metadata) dlogpdf_df = functools.partial(model.dlogpdf_df, y=Y, Y_metadata=Y_metadata) grad = 
GradientChecker(logpdf, dlogpdf_df, f.copy(), 'g') grad.randomize() - print model + print(model) assert grad.checkgrad(verbose=1) @with_setup(setUp, tearDown) def t_d2logpdf_df2(self, model, Y, f, Y_metadata): - print "\n{}".format(inspect.stack()[0][3]) + print("\n{}".format(inspect.stack()[0][3])) dlogpdf_df = functools.partial(model.dlogpdf_df, y=Y, Y_metadata=Y_metadata) d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=Y, Y_metadata=Y_metadata) grad = GradientChecker(dlogpdf_df, d2logpdf_df2, f.copy(), 'g') grad.randomize() - print model + print(model) assert grad.checkgrad(verbose=1) @with_setup(setUp, tearDown) def t_d3logpdf_df3(self, model, Y, f, Y_metadata): - print "\n{}".format(inspect.stack()[0][3]) + print("\n{}".format(inspect.stack()[0][3])) d2logpdf_df2 = functools.partial(model.d2logpdf_df2, y=Y, Y_metadata=Y_metadata) d3logpdf_df3 = functools.partial(model.d3logpdf_df3, y=Y, Y_metadata=Y_metadata) grad = GradientChecker(d2logpdf_df2, d3logpdf_df3, f.copy(), 'g') grad.randomize() - print model + print(model) assert grad.checkgrad(verbose=1) ############## @@ -417,8 +417,8 @@ class TestNoiseModels(object): ############## @with_setup(setUp, tearDown) def t_dlogpdf_dparams(self, model, Y, f, Y_metadata, params, params_names, param_constraints): - print "\n{}".format(inspect.stack()[0][3]) - print model + print("\n{}".format(inspect.stack()[0][3])) + print(model) assert ( dparam_checkgrad(model.logpdf, model.dlogpdf_dtheta, params, params_names, args=(f, Y, Y_metadata), constraints=param_constraints, @@ -427,8 +427,8 @@ class TestNoiseModels(object): @with_setup(setUp, tearDown) def t_dlogpdf_df_dparams(self, model, Y, f, Y_metadata, params, params_names, param_constraints): - print "\n{}".format(inspect.stack()[0][3]) - print model + print("\n{}".format(inspect.stack()[0][3])) + print(model) assert ( dparam_checkgrad(model.dlogpdf_df, model.dlogpdf_df_dtheta, params, params_names, args=(f, Y, Y_metadata), constraints=param_constraints, @@ -437,8 +437,8 @@ class TestNoiseModels(object): @with_setup(setUp, tearDown) def t_d2logpdf2_df2_dparams(self, model, Y, f, Y_metadata, params, params_names, param_constraints): - print "\n{}".format(inspect.stack()[0][3]) - print model + print("\n{}".format(inspect.stack()[0][3])) + print(model) assert ( dparam_checkgrad(model.d2logpdf_df2, model.d2logpdf_df2_dtheta, params, params_names, args=(f, Y, Y_metadata), constraints=param_constraints, @@ -450,7 +450,7 @@ class TestNoiseModels(object): ################ @with_setup(setUp, tearDown) def t_dlogpdf_dlink(self, model, Y, f, Y_metadata, link_f_constraints): - print "\n{}".format(inspect.stack()[0][3]) + print("\n{}".format(inspect.stack()[0][3])) logpdf = functools.partial(model.logpdf_link, y=Y, Y_metadata=Y_metadata) dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=Y, Y_metadata=Y_metadata) grad = GradientChecker(logpdf, dlogpdf_dlink, f.copy(), 'g') @@ -460,13 +460,13 @@ class TestNoiseModels(object): constraint('g', grad) grad.randomize() - print grad - print model + print(grad) + print(model) assert grad.checkgrad(verbose=1) @with_setup(setUp, tearDown) def t_d2logpdf_dlink2(self, model, Y, f, Y_metadata, link_f_constraints): - print "\n{}".format(inspect.stack()[0][3]) + print("\n{}".format(inspect.stack()[0][3])) dlogpdf_dlink = functools.partial(model.dlogpdf_dlink, y=Y, Y_metadata=Y_metadata) d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=Y, Y_metadata=Y_metadata) grad = GradientChecker(dlogpdf_dlink, d2logpdf_dlink2, f.copy(), 'g') @@ -476,13 +476,13 
@@ class TestNoiseModels(object): constraint('g', grad) grad.randomize() - print grad - print model + print(grad) + print(model) assert grad.checkgrad(verbose=1) @with_setup(setUp, tearDown) def t_d3logpdf_dlink3(self, model, Y, f, Y_metadata, link_f_constraints): - print "\n{}".format(inspect.stack()[0][3]) + print("\n{}".format(inspect.stack()[0][3])) d2logpdf_dlink2 = functools.partial(model.d2logpdf_dlink2, y=Y, Y_metadata=Y_metadata) d3logpdf_dlink3 = functools.partial(model.d3logpdf_dlink3, y=Y, Y_metadata=Y_metadata) grad = GradientChecker(d2logpdf_dlink2, d3logpdf_dlink3, f.copy(), 'g') @@ -492,8 +492,8 @@ class TestNoiseModels(object): constraint('g', grad) grad.randomize() - print grad - print model + print(grad) + print(model) assert grad.checkgrad(verbose=1) ################# @@ -501,8 +501,8 @@ class TestNoiseModels(object): ################# @with_setup(setUp, tearDown) def t_dlogpdf_link_dparams(self, model, Y, f, Y_metadata, params, param_names, param_constraints): - print "\n{}".format(inspect.stack()[0][3]) - print model + print("\n{}".format(inspect.stack()[0][3])) + print(model) assert ( dparam_checkgrad(model.logpdf_link, model.dlogpdf_link_dtheta, params, param_names, args=(f, Y, Y_metadata), constraints=param_constraints, @@ -511,8 +511,8 @@ class TestNoiseModels(object): @with_setup(setUp, tearDown) def t_dlogpdf_dlink_dparams(self, model, Y, f, Y_metadata, params, param_names, param_constraints): - print "\n{}".format(inspect.stack()[0][3]) - print model + print("\n{}".format(inspect.stack()[0][3])) + print(model) assert ( dparam_checkgrad(model.dlogpdf_dlink, model.dlogpdf_dlink_dtheta, params, param_names, args=(f, Y, Y_metadata), constraints=param_constraints, @@ -521,8 +521,8 @@ class TestNoiseModels(object): @with_setup(setUp, tearDown) def t_d2logpdf2_dlink2_dparams(self, model, Y, f, Y_metadata, params, param_names, param_constraints): - print "\n{}".format(inspect.stack()[0][3]) - print model + print("\n{}".format(inspect.stack()[0][3])) + print(model) assert ( dparam_checkgrad(model.d2logpdf_dlink2, model.d2logpdf_dlink2_dtheta, params, param_names, args=(f, Y, Y_metadata), constraints=param_constraints, @@ -534,7 +534,7 @@ class TestNoiseModels(object): ################ @with_setup(setUp, tearDown) def t_laplace_fit_rbf_white(self, model, X, Y, f, Y_metadata, step, param_vals, param_names, constraints): - print "\n{}".format(inspect.stack()[0][3]) + print("\n{}".format(inspect.stack()[0][3])) #Normalize Y = Y/Y.max() white_var = 1e-5 @@ -548,7 +548,7 @@ class TestNoiseModels(object): for constrain_param, constraint in constraints: constraint(constrain_param, m) - print m + print(m) m.randomize() m.randomize() @@ -558,7 +558,7 @@ class TestNoiseModels(object): m[name] = param_vals[param_num] #m.optimize(max_iters=8) - print m + print(m) #if not m.checkgrad(step=step): #m.checkgrad(verbose=1, step=step) #NOTE this test appears to be stochastic for some likelihoods (student t?) 
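The dparam_partial and dparam_checkgrad changes above rest on the bound-method attribute rename: Python 3 dropped im_self and im_func in favour of the dunder names, which CPython has also accepted since 2.6. A self-contained check:

    class Quadratic(object):
        def value(self, x):
            return x * x

    bound = Quadratic().value
    # Python 2 spelling:  bound.im_self, bound.im_func
    # Portable spelling:  bound.__self__, bound.__func__
    assert bound.__self__.value(3) == 9
    assert bound.__func__(bound.__self__, 3) == 9
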
@@ -571,7 +571,7 @@ class TestNoiseModels(object): ########### @with_setup(setUp, tearDown) def t_ep_fit_rbf_white(self, model, X, Y, f, Y_metadata, step, param_vals, param_names, constraints): - print "\n{}".format(inspect.stack()[0][3]) + print("\n{}".format(inspect.stack()[0][3])) #Normalize Y = Y/Y.max() white_var = 1e-6 @@ -587,7 +587,7 @@ class TestNoiseModels(object): constraints[param_num](name, m) m.randomize() - print m + print(m) assert m.checkgrad(verbose=1, step=step) @@ -624,7 +624,7 @@ class LaplaceTests(unittest.TestCase): self.X = None def test_gaussian_d2logpdf_df2_2(self): - print "\n{}".format(inspect.stack()[0][3]) + print("\n{}".format(inspect.stack()[0][3])) self.Y = None self.N = 2 @@ -673,17 +673,17 @@ class LaplaceTests(unittest.TestCase): m2.randomize() if debug: - print m1 - print m2 + print(m1) + print(m2) optimizer = 'scg' - print "Gaussian" + print("Gaussian") m1.optimize(optimizer, messages=debug, ipython_notebook=False) - print "Laplace Gaussian" + print("Laplace Gaussian") m2.optimize(optimizer, messages=debug, ipython_notebook=False) if debug: - print m1 - print m2 + print(m1) + print(m2) m2[:] = m1[:] @@ -730,5 +730,5 @@ class LaplaceTests(unittest.TestCase): self.assertTrue(m2.checkgrad(verbose=True)) if __name__ == "__main__": - print "Running unit tests" + print("Running unit tests") unittest.main() diff --git a/GPy/testing/mapping_tests.py b/GPy/testing/mapping_tests.py index 2e32dad3..2ff0e2d8 100644 --- a/GPy/testing/mapping_tests.py +++ b/GPy/testing/mapping_tests.py @@ -26,11 +26,6 @@ class MappingGradChecker(GPy.core.Model): self.mapping.update_gradients(self.dL_dY, self.X) - - - - - class MappingTests(unittest.TestCase): def test_kernelmapping(self): @@ -68,5 +63,5 @@ class MappingTests(unittest.TestCase): if __name__ == "__main__": - print "Running unit tests, please be (very) patient..." + print("Running unit tests, please be (very) patient...") unittest.main() diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py index 5950de08..ce78ee88 100644 --- a/GPy/testing/model_tests.py +++ b/GPy/testing/model_tests.py @@ -153,19 +153,19 @@ class MiscTests(unittest.TestCase): def test_big_model(self): m = GPy.examples.dimensionality_reduction.mrd_simulation(optimize=0, plot=0, plot_sim=0) m.X.fix() - print m + print(m) m.unfix() m.checkgrad() - print m + print(m) m.fix() - print m + print(m) m.inducing_inputs.unfix() - print m + print(m) m.checkgrad() m.unfix() m.checkgrad() m.checkgrad() - print m + print(m) def test_model_set_params(self): m = GPy.models.GPRegression(self.X, self.Y) @@ -176,7 +176,7 @@ class MiscTests(unittest.TestCase): m['.*var'] -= .1 np.testing.assert_equal(m.kern.lengthscale, lengthscale) m.optimize() - print m + print(m) def test_model_updates(self): Y1 = np.random.normal(0, 1, (40, 13)) @@ -201,7 +201,7 @@ class MiscTests(unittest.TestCase): Y = np.sin(X) + np.random.randn(20, 1) * 0.05 m = GPy.models.GPRegression(X, Y) m.optimize() - print m + print(m) class GradientTests(np.testing.TestCase): def setUp(self): @@ -523,5 +523,5 @@ class GradientTests(np.testing.TestCase): if __name__ == "__main__": - print "Running unit tests, please be (very) patient..." + print("Running unit tests, please be (very) patient...") unittest.main() diff --git a/GPy/testing/mpi_tests.py b/GPy/testing/mpi_tests.py index 5c489032..28a23288 100644 --- a/GPy/testing/mpi_tests.py +++ b/GPy/testing/mpi_tests.py @@ -84,7 +84,7 @@ except: if __name__ == "__main__": - print "Running unit tests, please be (very) patient..." 
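The same sweep replaces the dictionary methods removed in Python 3, seen in mrd.py and sparse_gp_minibatch.py earlier and again in the pickle tests below: d.has_key(k) becomes k in d, and iteritems() becomes items(), which returns a lazy view on Python 3 and a list copy on Python 2, either of which is fine to iterate once. A sketch, with keys echoing MRD.__getstate__:

    state = {'kern': 1, 'likelihood': 2}  # illustrative keys
    if 'kern' in state:         # Python 2 spelling: state.has_key('kern')
        del state['kern']
    for k, v in state.items():  # Python 2 spelling: state.iteritems()
        print(k, v)
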
+ print("Running unit tests, please be (very) patient...") try: import mpi4py unittest.main() diff --git a/GPy/testing/parameterized_tests.py b/GPy/testing/parameterized_tests.py index 7c4f4ce2..0fb129ff 100644 --- a/GPy/testing/parameterized_tests.py +++ b/GPy/testing/parameterized_tests.py @@ -12,6 +12,7 @@ from GPy.core.parameterization.transformations import NegativeLogexp, Logistic from GPy.core.parameterization.parameterized import Parameterized from GPy.core.parameterization.param import Param from GPy.core.parameterization.index_operations import ParameterIndexOperations +from functools import reduce class ArrayCoreTest(unittest.TestCase): def setUp(self): @@ -107,7 +108,7 @@ class ParameterizedTest(unittest.TestCase): self.assertListEqual(self.white._fixes_.tolist(), [FIXED]) self.assertIs(self.test1.constraints, self.rbf.constraints._param_index_ops) self.assertIs(self.test1.constraints, self.param.constraints._param_index_ops) - self.assertListEqual(self.test1.constraints[Logexp()].tolist(), range(self.param.size, self.param.size+self.rbf.size)) + self.assertListEqual(self.test1.constraints[Logexp()].tolist(), list(range(self.param.size, self.param.size+self.rbf.size))) def test_remove_parameter_param_array_grad_array(self): val = self.test1.kern.param_array.copy() @@ -120,15 +121,15 @@ class ParameterizedTest(unittest.TestCase): def test_default_constraints(self): self.assertIs(self.rbf.variance.constraints._param_index_ops, self.rbf.constraints._param_index_ops) self.assertIs(self.test1.constraints, self.rbf.constraints._param_index_ops) - self.assertListEqual(self.rbf.constraints.indices()[0].tolist(), range(2)) + self.assertListEqual(self.rbf.constraints.indices()[0].tolist(), list(range(2))) from GPy.core.parameterization.transformations import Logexp kern = self.test1.kern self.test1.unlink_parameter(kern) - self.assertListEqual(kern.constraints[Logexp()].tolist(), range(3)) + self.assertListEqual(kern.constraints[Logexp()].tolist(), list(range(3))) def test_constraints(self): self.rbf.constrain(GPy.transformations.Square(), False) - self.assertListEqual(self.test1.constraints[GPy.transformations.Square()].tolist(), range(self.param.size, self.param.size+self.rbf.size)) + self.assertListEqual(self.test1.constraints[GPy.transformations.Square()].tolist(), list(range(self.param.size, self.param.size+self.rbf.size))) self.assertListEqual(self.test1.constraints[GPy.transformations.Logexp()].tolist(), [self.param.size+self.rbf.size]) self.test1.kern.unlink_parameter(self.rbf) @@ -181,8 +182,8 @@ class ParameterizedTest(unittest.TestCase): def test_add_parameter_in_hierarchy(self): self.test1.kern.rbf.link_parameter(Param("NEW", np.random.rand(2), NegativeLogexp()), 1) - self.assertListEqual(self.test1.constraints[NegativeLogexp()].tolist(), range(self.param.size+1, self.param.size+1 + 2)) - self.assertListEqual(self.test1.constraints[GPy.transformations.Logistic(0,1)].tolist(), range(self.param.size)) + self.assertListEqual(self.test1.constraints[NegativeLogexp()].tolist(), list(range(self.param.size+1, self.param.size+1 + 2))) + self.assertListEqual(self.test1.constraints[GPy.transformations.Logistic(0,1)].tolist(), list(range(self.param.size))) self.assertListEqual(self.test1.constraints[GPy.transformations.Logexp(0,1)].tolist(), np.r_[50, 53:55].tolist()) def test_regular_expression_misc(self): @@ -240,7 +241,7 @@ class ParameterizedTest(unittest.TestCase): self.p2.constrain_positive() m = TestLikelihood() - print m + print(m) val = m.p1.values.copy() 
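parameterized_tests.py above also absorbs two builtin changes that fail loudly only in tests: range() now returns a lazy sequence rather than a list, so equality assertions need list(range(...)), and reduce() moved into functools. Both in one runnable sketch (Python 3 semantics):

    from functools import reduce  # no longer a builtin on Python 3

    assert range(2) != [0, 1]        # a range object never equals a list...
    assert list(range(2)) == [0, 1]  # ...so the assertions convert first
    assert reduce(lambda a, b: a + b, range(4)) == 6
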
self.assert_(m.p1.is_fixed) self.assert_(m.constraints[GPy.constraints.Logexp()].tolist(), [1]) @@ -248,9 +249,9 @@ class ParameterizedTest(unittest.TestCase): self.assertEqual(m.p1, val) def test_printing(self): - print self.test1 - print self.param - print self.test1[''] + print(self.test1) + print(self.param) + print(self.test1['']) if __name__ == "__main__": #import sys;sys.argv = ['', 'Test.test_add_parameter'] diff --git a/GPy/testing/pickle_tests.py b/GPy/testing/pickle_tests.py index c79e9914..fd1bf93c 100644 --- a/GPy/testing/pickle_tests.py +++ b/GPy/testing/pickle_tests.py @@ -19,6 +19,7 @@ from GPy.kern._src.static import Bias, White from GPy.examples.dimensionality_reduction import mrd_simulation from GPy.core.parameterization.variational import NormalPosterior from GPy.models.gp_regression import GPRegression +from functools import reduce def toy_model(): X = np.linspace(0,1,50)[:, None] @@ -28,18 +29,25 @@ def toy_model(): class ListDictTestCase(unittest.TestCase): def assertListDictEquals(self, d1, d2, msg=None): - for k,v in d1.iteritems(): + #py3 fix + #for k,v in d1.iteritems(): + for k,v in d1.items(): self.assertListEqual(list(v), list(d2[k]), msg) def assertArrayListEquals(self, l1, l2): - for a1, a2 in itertools.izip(l1,l2): + for a1, a2 in zip(l1,l2): np.testing.assert_array_equal(a1, a2) class Test(ListDictTestCase): def test_parameter_index_operations(self): pio = ParameterIndexOperations(dict(test1=np.array([4,3,1,6,4]), test2=np.r_[2:130])) piov = ParameterIndexOperationsView(pio, 20, 250) - self.assertListDictEquals(dict(piov.items()), dict(piov.copy().iteritems())) - self.assertListDictEquals(dict(pio.iteritems()), dict(pio.copy().items())) + #py3 fix + #self.assertListDictEquals(dict(piov.items()), dict(piov.copy().iteritems())) + self.assertListDictEquals(dict(piov.items()), dict(piov.copy().items())) + + #py3 fix + #self.assertListDictEquals(dict(pio.iteritems()), dict(pio.copy().items())) + self.assertListDictEquals(dict(pio.items()), dict(pio.copy().items())) self.assertArrayListEquals(pio.copy().indices(), pio.indices()) self.assertArrayListEquals(piov.copy().indices(), piov.indices()) @@ -54,7 +62,9 @@ class Test(ListDictTestCase): pickle.dump(piov, f) f.seek(0) pio2 = pickle.load(f) - self.assertListDictEquals(dict(piov.items()), dict(pio2.iteritems())) + #py3 fix + #self.assertListDictEquals(dict(piov.items()), dict(pio2.iteritems())) + self.assertListDictEquals(dict(piov.items()), dict(pio2.items())) def test_param(self): param = Param('test', np.arange(4*2).reshape(4,2)) diff --git a/GPy/testing/prior_tests.py b/GPy/testing/prior_tests.py index 6a61fbb5..ca03ad93 100644 --- a/GPy/testing/prior_tests.py +++ b/GPy/testing/prior_tests.py @@ -110,5 +110,5 @@ class PriorTests(unittest.TestCase): if __name__ == "__main__": - print "Running unit tests, please be (very) patient..." + print("Running unit tests, please be (very) patient...") unittest.main() diff --git a/GPy/util/__init__.py b/GPy/util/__init__.py index c3edfc48..e8d2456e 100644 --- a/GPy/util/__init__.py +++ b/GPy/util/__init__.py @@ -2,18 +2,18 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) -import linalg -import misc -import squashers -import warping_functions -import datasets -import mocap -import decorators -import classification -import subarray_and_sorting -import caching -import diag -import initialization -import multioutput -import linalg_gpu +from . import linalg +from . import misc +from . import squashers +from . import warping_functions +from . 
import datasets +from . import mocap +from . import decorators +from . import classification +from . import subarray_and_sorting +from . import caching +from . import diag +from . import initialization +from . import multioutput +from . import linalg_gpu diff --git a/GPy/util/block_matrices.py b/GPy/util/block_matrices.py index 464e3ba1..a047abc6 100644 --- a/GPy/util/block_matrices.py +++ b/GPy/util/block_matrices.py @@ -71,6 +71,6 @@ if __name__=='__main__': A = np.zeros((5,5)) B = get_blocks(A,[2,3]) B[0,0] += 7 - print B + print(B) assert np.all(unblock(B) == A) diff --git a/GPy/util/caching.py b/GPy/util/caching.py index 16adc320..196ce343 100644 --- a/GPy/util/caching.py +++ b/GPy/util/caching.py @@ -2,6 +2,7 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) from ..core.parameterization.observable import Observable import collections, weakref +from functools import reduce class Cacher(object): def __init__(self, operation, limit=5, ignore_args=(), force_kwargs=()): @@ -148,10 +149,10 @@ class Cacher(object): return Cacher(self.operation, self.limit, self.ignore_args, self.force_kwargs) def __getstate__(self, memo=None): - raise NotImplementedError, "Trying to pickle Cacher object with function {}, pickling functions not possible.".format(str(self.operation)) + raise NotImplementedError("Trying to pickle Cacher object with function {}, pickling functions not possible.".format(str(self.operation))) def __setstate__(self, memo=None): - raise NotImplementedError, "Trying to pickle Cacher object with function {}, pickling functions not possible.".format(str(self.operation)) + raise NotImplementedError("Trying to pickle Cacher object with function {}, pickling functions not possible.".format(str(self.operation))) @property def __name__(self): diff --git a/GPy/util/choleskies.py b/GPy/util/choleskies.py index 3f37fc3f..37ac7211 100644 --- a/GPy/util/choleskies.py +++ b/GPy/util/choleskies.py @@ -2,23 +2,28 @@ # Licensed under the GNU GPL version 3.0 import numpy as np -from scipy import weave -import linalg +from . import linalg +from .config import config +try: + from scipy import weave +except ImportError: + config.set('weave', 'working', 'False') def safe_root(N): i = np.sqrt(N) j = int(i) if i != j: - raise ValueError, "N is not square!" + raise ValueError("N is not square!") return j -def flat_to_triang(flat): +def _flat_to_triang_weave(flat): """take a matrix N x D and return a M X M x D array where N = M(M+1)/2 the lower triangluar portion of the d'th slice of the result is filled by the d'th column of flat. 
+ This is the weave implementation """ N, D = flat.shape M = (-1 + safe_root(8*N+1))/2 @@ -42,7 +47,24 @@ def flat_to_triang(flat): weave.inline(code, ['flat', 'ret', 'D', 'M']) return ret -def triang_to_flat(L): +def _flat_to_triang_pure(flat_mat): + N, D = flat_mat.shape + M = (-1 + safe_root(8*N+1))//2 + ret = np.zeros((M, M, D)) + count = 0 + for m in range(M): + for mm in range(m+1): + for d in range(D): + ret.flat[d + m*D*M + mm*D] = flat_mat.flat[count]; + count = count+1 + return ret + +if config.getboolean('weave', 'working'): + flat_to_triang = _flat_to_triang_weave +else: + flat_to_triang = _flat_to_triang_pure + +def _triang_to_flat_weave(L): M, _, D = L.shape L = np.ascontiguousarray(L) # should do nothing if L was created by flat_to_triang @@ -66,13 +88,31 @@ def triang_to_flat(L): weave.inline(code, ['flat', 'L', 'D', 'M']) return flat +def _triang_to_flat_pure(L): + M, _, D = L.shape + + N = M*(M+1)//2 + flat = np.empty((N, D)) + count = 0; + for m in range(M): + for mm in range(m+1): + for d in range(D): + flat.flat[count] = L.flat[d + m*D*M + mm*D]; + count = count +1 + return flat + +if config.getboolean('weave', 'working'): + triang_to_flat = _triang_to_flat_weave +else: + triang_to_flat = _triang_to_flat_pure + def triang_to_cov(L): - return np.dstack([np.dot(L[:,:,i], L[:,:,i].T) for i in xrange(L.shape[-1])]) + return np.dstack([np.dot(L[:,:,i], L[:,:,i].T) for i in range(L.shape[-1])]) def multiple_dpotri_old(Ls): M, _, D = Ls.shape Kis = np.rollaxis(Ls, -1).copy() - [dpotri(Kis[i,:,:], overwrite_c=1, lower=1) for i in xrange(D)] + [dpotri(Kis[i,:,:], overwrite_c=1, lower=1) for i in range(D)] code = """ for(int d=0; d", save_name + print("Downloading ", url, "->", save_name) if not os.path.exists(dir_name): os.makedirs(dir_name) try: - response = urllib2.urlopen(url+suffix) - except urllib2.URLError, e: + response = urlopen(url+suffix) + except URLError as e: if not hasattr(e, "code"): raise response = e @@ -150,7 +162,7 @@ def download_url(url, store_directory, save_name=None, messages=True, suffix='') sys.stdout.write(status) sys.stdout.flush() sys.stdout.write(" "*(len(status)) + "\r") - print status + print(status) # if we wanted to get more sophisticated maybe we should check the response code here again even for successes. #with open(save_name, 'wb') as f: # f.write(response.read()) @@ -159,32 +171,32 @@ def download_url(url, store_directory, save_name=None, messages=True, suffix='') def authorize_download(dataset_name=None): """Check with the user that the are happy with terms and conditions for the data set.""" - print('Acquiring resource: ' + dataset_name) + print(('Acquiring resource: ' + dataset_name)) # TODO, check resource is in dictionary! 
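datasets.py above combines two more conversions: the exception-clause spelling (except URLError, e: is Python 2 only, except URLError as e: works from 2.6 onwards) and the urllib2 reorganization into urllib.request and urllib.error. The import shim itself falls outside this excerpt, so the sketch below is an assumed compatibility pattern rather than GPy's exact code:

    try:                                   # Python 3 locations
        from urllib.request import urlopen
        from urllib.error import URLError
    except ImportError:                    # Python 2 fallback
        from urllib2 import urlopen, URLError

    try:
        response = urlopen("http://example.com/data.csv")  # illustrative URL
    except URLError as e:   # Python 2 only spelling: except URLError, e:
        print("download failed:", e)
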
diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py
-    print "Downloading ", url, "->", save_name
+    print("Downloading ", url, "->", save_name)
     if not os.path.exists(dir_name):
         os.makedirs(dir_name)
     try:
-        response = urllib2.urlopen(url+suffix)
-    except urllib2.URLError, e:
+        response = urlopen(url+suffix)
+    except URLError as e:
         if not hasattr(e, "code"):
             raise
         response = e
@@ -150,7 +162,7 @@ def download_url(url, store_directory, save_name=None, messages=True, suffix='')
             sys.stdout.write(status)
             sys.stdout.flush()
         sys.stdout.write(" "*(len(status)) + "\r")
-        print status
+        print(status)
     # if we wanted to get more sophisticated maybe we should check the response code here again even for successes.
     #with open(save_name, 'wb') as f:
     #    f.write(response.read())
@@ -159,32 +171,32 @@ def download_url(url, store_directory, save_name=None, messages=True, suffix='')

 def authorize_download(dataset_name=None):
     """Check with the user that the are happy with terms and conditions for the data set."""
     print('Acquiring resource: ' + dataset_name)
     # TODO, check resource is in dictionary!
     print('')
     dr = data_resources[dataset_name]
     print('Details of data: ')
     print(dr['details'])
     print('')
     if dr['citation']:
         print('Please cite:')
         print(dr['citation'])
         print('')
     if dr['size']:
         print('After downloading the data will take up ' + str(dr['size']) + ' bytes of space.')
         print('')
     print('Data will be stored in ' + os.path.join(data_path, dataset_name) + '.')
     print('')
     if overide_manual_authorize:
         if dr['license']:
             print('You have agreed to the following license:')
             print(dr['license'])
             print('')
         return True
     else:
         if dr['license']:
             print('You must also agree to the following license:')
             print(dr['license'])
             print('')
         return prompt_user('Do you wish to proceed with the download? [yes/no]')
@@ -495,18 +507,18 @@ def google_trends(query_terms=['big data', 'machine learning', 'data science'],
     file = 'data.csv'
     file_name = os.path.join(dir_path,file)
     if not os.path.exists(file_name) or refresh_data:
-        print "Accessing Google trends to acquire the data. Note that repeated accesses will result in a block due to a google terms of service violation. Failure at this point may be due to such blocks."
+        print("Accessing Google trends to acquire the data. Note that repeated accesses will result in a block due to a google terms of service violation. Failure at this point may be due to such blocks.")
         # quote the query terms.
         quoted_terms = []
         for term in query_terms:
             quoted_terms.append(urllib2.quote(term))
-        print "Query terms: ", ', '.join(query_terms)
+        print("Query terms: ", ', '.join(query_terms))
-        print "Fetching query:"
+        print("Fetching query:")
         query = 'http://www.google.com/trends/fetchComponent?q=%s&cid=TIMESERIES_GRAPH_0&export=3' % ",".join(quoted_terms)
-        data = urllib2.urlopen(query).read()
+        data = urlopen(query).read()
-        print "Done."
+        print("Done.")
         # In the notebook they did some data cleaning: remove Javascript header+footer, and translate new Date(....,..,..) into YYYY-MM-DD.
         header = """// Data table response\ngoogle.visualization.Query.setResponse("""
         data = data[len(header):-2]
@@ -520,8 +532,8 @@ def google_trends(query_terms=['big data', 'machine learning', 'data science'],
         df.to_csv(file_name)
     else:
-        print "Reading cached data for google trends. To refresh the cache set 'refresh_data=True' when calling this function."
+        print("Reading cached data for google trends. To refresh the cache set 'refresh_data=True' when calling this function.")
-        print "Query terms: ", ', '.join(query_terms)
+        print("Query terms: ", ', '.join(query_terms))
         df = pandas.read_csv(file_name, parse_dates=[0])
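# Reviewer note as code (not part of the patch): the google_trends hunk above
# still calls urllib2.quote on an unchanged context line, which raises a
# NameError on Python 3 once urllib2 is gone. The import block of datasets.py
# is not visible in this excerpt, so the following version-agnostic shim is an
# assumption about how that leftover should be treated, mirroring the
# urlopen/URLError changes already in the hunk:
try:                                    # Python 3
    from urllib.parse import quote
    from urllib.request import urlopen
    from urllib.error import URLError
except ImportError:                     # Python 2
    from urllib2 import quote, urlopen, URLError

quoted_terms = [quote(term) for term in ['big data', 'machine learning']]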
@@ -679,11 +691,11 @@ def ripley_synth(data_set='ripley_prnn_data'):
 def global_average_temperature(data_set='global_temperature', num_train=1000, refresh_data=False):
     path = os.path.join(data_path, data_set)
     if data_available(data_set) and not refresh_data:
-        print 'Using cached version of the data set, to use latest version set refresh_data to True'
+        print('Using cached version of the data set, to use latest version set refresh_data to True')
     else:
         download_data(data_set)
     data = np.loadtxt(os.path.join(data_path, data_set, 'GLBTS.long.data'))
-    print 'Most recent data observation from month ', data[-1, 1], ' in year ', data[-1, 0]
+    print('Most recent data observation from month ', data[-1, 1], ' in year ', data[-1, 0])
     allX = data[data[:, 3]!=-99.99, 2:3]
     allY = data[data[:, 3]!=-99.99, 3:4]
     X = allX[:num_train, 0:1]
@@ -695,11 +707,11 @@ def mauna_loa(data_set='mauna_loa', num_train=545, refresh_data=False):
     path = os.path.join(data_path, data_set)
     if data_available(data_set) and not refresh_data:
-        print 'Using cached version of the data set, to use latest version set refresh_data to True'
+        print('Using cached version of the data set, to use latest version set refresh_data to True')
     else:
         download_data(data_set)
     data = np.loadtxt(os.path.join(data_path, data_set, 'co2_mm_mlo.txt'))
-    print 'Most recent data observation from month ', data[-1, 1], ' in year ', data[-1, 0]
+    print('Most recent data observation from month ', data[-1, 1], ' in year ', data[-1, 0])
     allX = data[data[:, 3]!=-99.99, 2:3]
     allY = data[data[:, 3]!=-99.99, 3:4]
     X = allX[:num_train, 0:1]
@@ -784,7 +796,7 @@ def hapmap3(data_set='hapmap3'):
         from sys import stdout
         import bz2
     except ImportError as i:
-        raise i, "Need pandas for hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset"
+        raise ImportError("Need pandas for hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset")
     dir_path = os.path.join(data_path,'hapmap3')
     hapmap_file_name = 'hapmap3_r2_b36_fwd.consensus.qc.poly'
@@ -802,10 +814,10 @@ def hapmap3(data_set='hapmap3'):
     if not reduce(lambda a,b: a and b, map(os.path.exists, preprocessed_data_paths)):
         if not overide_manual_authorize and not prompt_user("Preprocessing requires ~25GB "
                                                             "of memory and can take a (very) long time, continue? [Y/n]"):
-            print "Preprocessing required for further usage."
+            print("Preprocessing required for further usage.")
             return
         status = "Preprocessing data, please be patient..."
-        print status
+        print(status)
         def write_status(message, progress, status):
             stdout.write(" "*len(status)); stdout.write("\r"); stdout.flush()
             status = r"[{perc: <{ll}}] {message: <13s}".format(message=message, ll=20,
@@ -873,13 +885,13 @@ def hapmap3(data_set='hapmap3'):
         inandf = DataFrame(index=metadf.index, data=inan, columns=mapnp[:,1])
         inandf.to_pickle(preprocessed_data_paths[2])
         status=write_status('done :)', 100, status)
-        print ''
+        print('')
     else:
-        print "loading snps..."
+        print("loading snps...")
         snpsdf = read_pickle(preprocessed_data_paths[0])
-        print "loading metainfo..."
+        print("loading metainfo...")
         metadf = read_pickle(preprocessed_data_paths[1])
-        print "loading nan entries..."
+        print("loading nan entries...")
         inandf = read_pickle(preprocessed_data_paths[2])
     snps = snpsdf.values
     populations = metadf.population.values.astype('S3')
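# Reviewer sketch (not part of the patch): the hapmap3 hunk keeps the
# Python 2 idiom reduce(lambda a, b: a and b, map(os.path.exists, ...)),
# which is why 'from functools import reduce' had to be added elsewhere.
# The builtin all() expresses the same check without the import and
# short-circuits on the first missing file; the paths here are placeholders:
import os

preprocessed_data_paths = ['/tmp/snps.pickle', '/tmp/meta.pickle', '/tmp/nan.pickle']
if not all(os.path.exists(p) for p in preprocessed_data_paths):
    print("Preprocessing required for further usage.")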
+ print("loading nan entries...") inandf = read_pickle(preprocessed_data_paths[2]) snps = snpsdf.values populations = metadf.population.values.astype('S3') @@ -1001,7 +1013,7 @@ def singlecell_rna_seq_deng(dataset='singlecell_deng'): # Extract the tar file filename = os.path.join(dir_path, 'GSE45719_Raw.tar') with tarfile.open(filename, 'r') as files: - print "Extracting Archive {}...".format(files.name) + print("Extracting Archive {}...".format(files.name)) data = None gene_info = None message = '' @@ -1010,9 +1022,9 @@ def singlecell_rna_seq_deng(dataset='singlecell_deng'): for i, file_info in enumerate(members): f = files.extractfile(file_info) inner = read_csv(f, sep='\t', header=0, compression='gzip', index_col=0) - print ' '*(len(message)+1) + '\r', + print(' '*(len(message)+1) + '\r', end=' ') message = "{: >7.2%}: Extracting: {}".format(float(i+1)/overall, file_info.name[:20]+"...txt.gz") - print message, + print(message, end=' ') if data is None: data = inner.RPKM.to_frame() data.columns = [file_info.name[:-18]] @@ -1035,8 +1047,8 @@ def singlecell_rna_seq_deng(dataset='singlecell_deng'): sys.stdout.write(' '*len(message) + '\r') sys.stdout.flush() - print - print "Read Archive {}".format(files.name) + print() + print("Read Archive {}".format(files.name)) return data_details_return({'Y': data, 'series_info': info, diff --git a/GPy/util/debug.py b/GPy/util/debug.py index 00107f5e..d691ad82 100644 --- a/GPy/util/debug.py +++ b/GPy/util/debug.py @@ -13,7 +13,7 @@ def checkFinite(arr, name=None): if np.any(np.logical_not(np.isfinite(arr))): idx = np.where(np.logical_not(np.isfinite(arr)))[0] - print name+' at indices '+str(idx)+' have not finite values: '+str(arr[idx])+'!' + print(name+' at indices '+str(idx)+' have not finite values: '+str(arr[idx])+'!') return False return True @@ -23,13 +23,13 @@ def checkFullRank(m, tol=1e-10, name=None, force_check=False): assert len(m.shape)==2 and m.shape[0]==m.shape[1], 'The input of checkFullRank has to be a square matrix!' if not force_check and m.shape[0]>=10000: - print 'The size of '+name+'is too big to check (>=10000)!' + print('The size of '+name+'is too big to check (>=10000)!') return True s = np.real(np.linalg.eigvals(m)) if s.min()/s.max()=pycuda.driver.Device.count(): - print '['+MPI.Get_processor_name()+'] more processes than the GPU numbers!' 
     if gpuid >= pycuda.driver.Device.count():
-        print '['+MPI.Get_processor_name()+'] more processes than the GPU numbers!'
+        print('['+MPI.Get_processor_name()+'] more processes than the GPU numbers!')
         #MPI.COMM_WORLD.Abort()
         raise
     gpu_device = pycuda.driver.Device(gpuid)
diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index 1089b557..8ac5418f 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -6,16 +6,22 @@
 # http://homepages.inf.ed.ac.uk/imurray2/code/tdot/tdot.py

 import numpy as np
-from scipy import linalg, weave
+from scipy import linalg
 import types
 import ctypes
 from ctypes import byref, c_char, c_int, c_double # TODO
 import scipy
 import warnings
 import os
-from config import config
+from .config import config
 import logging

+try:
+    from scipy import weave
+except ImportError:
+    config.set('weave', 'working', 'False')
+
+
 _scipyversion = np.float64((scipy.__version__).split('.')[:2])
 _fix_dpotri_scipy_bug = True
 if np.all(_scipyversion >= np.array([0, 14])):
@@ -34,7 +40,7 @@ if config.getboolean('anaconda', 'installed') and config.getboolean('anaconda',
         dsyrk = mkl_rt.dsyrk
         dsyr = mkl_rt.dsyr
         _blas_available = True
-        print 'anaconda installed and mkl is loaded'
+        print('anaconda installed and mkl is loaded')
     except:
         _blas_available = False
 else:
@@ -64,7 +70,7 @@ def force_F_ordered(A):
     """
     if A.flags['F_CONTIGUOUS']:
         return A
-    print "why are your arrays not F order?"
+    print("why are your arrays not F order?")
     return np.asfortranarray(A)

 # def jitchol(A, maxtries=5):
@@ -91,19 +97,19 @@ def jitchol(A, maxtries=5):
     else:
         diagA = np.diag(A)
         if np.any(diagA <= 0.):
-            raise linalg.LinAlgError, "not pd: non-positive diagonal elements"
+            raise linalg.LinAlgError("not pd: non-positive diagonal elements")
         jitter = diagA.mean() * 1e-6
         num_tries = 1
         while num_tries <= maxtries and np.isfinite(jitter):
             try:
-                print jitter
+                print(jitter)
                 L = linalg.cholesky(A + np.eye(A.shape[0]) * jitter, lower=True)
                 return L
             except:
                 jitter *= 10
             finally:
                 num_tries += 1
-        raise linalg.LinAlgError, "not positive definite, even with jitter."
+        raise linalg.LinAlgError("not positive definite, even with jitter.")
     import traceback
     try: raise
     except:
@@ -213,12 +219,12 @@ def mdot(*args):
 def _mdot_r(a, b):
     """Recursive helper for mdot"""
-    if type(a) == types.TupleType:
+    if type(a) == tuple:
         if len(a) > 1:
             a = mdot(*a)
         else:
             a = a[0]
-    if type(b) == types.TupleType:
+    if type(b) == tuple:
         if len(b) > 1:
             b = mdot(*b)
         else:
@@ -293,7 +299,7 @@ def pca(Y, input_dim):
     """
     if not np.allclose(Y.mean(axis=0), 0.0):
-        print "Y is not zero mean, centering it locally (GPy.util.linalg.pca)"
+        print("Y is not zero mean, centering it locally (GPy.util.linalg.pca)")

     # Y -= Y.mean(axis=0)
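# Reviewer sketch (not part of the patch): jitchol, as ported above, retries
# the Cholesky factorisation with growing diagonal jitter, starting from
# mean(diag(A))*1e-6 and multiplying by 10 on each failure, up to maxtries.
# Minimal demonstration on a positive semi-definite (singular) matrix:
import numpy as np
from GPy.util.linalg import jitchol

A = np.ones((3, 3))                  # PSD but singular, plain Cholesky fails
L = jitchol(A, maxtries=5)           # succeeds once enough jitter is added
assert np.allclose(L.dot(L.T), A, atol=1e-3)   # equal up to the tiny jitter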
@@ -352,16 +358,16 @@ def tdot_blas(mat, out=None):
     # of C order. However, I tried that and had errors with large matrices:
     # http://homepages.inf.ed.ac.uk/imurray2/code/tdot/tdot_broken.py
     mat = np.asfortranarray(mat)
-    TRANS = c_char('n')
+    TRANS = c_char('n'.encode('ascii'))
     N = c_int(mat.shape[0])
     K = c_int(mat.shape[1])
     LDA = c_int(mat.shape[0])
-    UPLO = c_char('l')
+    UPLO = c_char('l'.encode('ascii'))
     ALPHA = c_double(1.0)
     A = mat.ctypes.data_as(ctypes.c_void_p)
     BETA = c_double(0.0)
     C = out.ctypes.data_as(ctypes.c_void_p)
-    LDC = c_int(np.max(out.strides) / 8)
+    LDC = c_int(np.max(out.strides) // 8)
     dsyrk(byref(UPLO), byref(TRANS), byref(N), byref(K), byref(ALPHA), A, byref(LDA), byref(BETA), C, byref(LDC))
@@ -388,7 +394,7 @@ def DSYR_blas(A, x, alpha=1.):
     """
     N = c_int(A.shape[0])
     LDA = c_int(A.shape[0])
-    UPLO = c_char('l')
+    UPLO = c_char('l'.encode('ascii'))
     ALPHA = c_double(alpha)
     A_ = A.ctypes.data_as(ctypes.c_void_p)
     x_ = x.ctypes.data_as(ctypes.c_void_p)
@@ -428,7 +434,7 @@ def symmetrify(A, upper=False):
     try:
         symmetrify_weave(A, upper)
     except:
-        print "\n Weave compilation failed. Falling back to (slower) numpy implementation\n"
+        print("\n Weave compilation failed. Falling back to (slower) numpy implementation\n")
         config.set('weave', 'working', 'False')
         symmetrify_numpy(A, upper)
     else:
@@ -494,34 +500,35 @@ def symmetrify_numpy(A, upper=False):
     else:
         A[triu] = A.T[triu]

-def cholupdate(L, x):
-    """
-    update the LOWER cholesky factor of a pd matrix IN PLACE
-
-    if L is the lower chol. of K, then this function computes L\_
-    where L\_ is the lower chol of K + x*x^T
-
-    """
-    support_code = """
-    #include <math.h>
-    """
-    code = """
-    double r,c,s;
-    int j,i;
-    for(j=0; j
+#def cholupdate(L, x):
+#    """
+#    update the LOWER cholesky factor of a pd matrix IN PLACE
+#
+#    if L is the lower chol. of K, then this function computes L\_
+#    where L\_ is the lower chol of K + x*x^T
+#
+#    """
+#    support_code = """
+#    #include <math.h>
+#    """
+#    code = """
+#    double r,c,s;
+#    int j,i;
+#    for(j=0; j
diff --git a/GPy/util/misc.py b/GPy/util/misc.py
 1 and len(df_dg.shape)>1 and df_dg.shape[-1] > 1:
-        import ipdb; ipdb.set_trace() # XXX BREAKPOINT
+        #import ipdb; ipdb.set_trace() # XXX BREAKPOINT
         raise NotImplementedError('Not implemented for matricies yet')
     return df_dg * dg_dx
diff --git a/GPy/util/mocap.py b/GPy/util/mocap.py
index 58662cf9..4f6336c5 100644
--- a/GPy/util/mocap.py
+++ b/GPy/util/mocap.py
@@ -2,7 +2,6 @@ import os
 import numpy as np
 import math
 from GPy.util import datasets as dat
-import urllib2

 class vertex:
     def __init__(self, name, id, parents=[], children=[], meta = {}):
@@ -174,7 +173,7 @@ class skeleton(tree):
         return connection

     def to_xyz(self, channels):
-        raise NotImplementedError, "this needs to be implemented to use the skeleton class"
+        raise NotImplementedError("this needs to be implemented to use the skeleton class")

     def finalize(self):
diff --git a/GPy/util/multioutput.py b/GPy/util/multioutput.py
index cc9af29e..2233dbb6 100644
--- a/GPy/util/multioutput.py
+++ b/GPy/util/multioutput.py
@@ -51,7 +51,7 @@ def ICM(input_dim, num_outputs, kernel, W_rank=1,W=None,kappa=None,name='ICM'):
     :param W_rank: number tuples of the corregionalization parameters 'W'
     :type W_rank: integer
     """
-    if kernel.input_dim <> input_dim:
+    if kernel.input_dim != input_dim:
         kernel.input_dim = input_dim
         warnings.warn("kernel's input dimension overwritten to fit input_dim parameter.")
diff --git a/GPy/util/parallel.py b/GPy/util/parallel.py
index fab43936..880dae58 100644
--- a/GPy/util/parallel.py
+++ b/GPy/util/parallel.py
@@ -27,7 +27,7 @@ def divide_data(datanum, rank, size):
     residue = (datanum)%size
     datanum_list = np.empty((size),dtype=np.int32)
-    for i in xrange(size):
+    for i in range(size):
         if i