diff --git a/GPy/core/parameterization/index_operations.py b/GPy/core/parameterization/index_operations.py
index 5c3e350f..ae04b2ee 100644
--- a/GPy/core/parameterization/index_operations.py
+++ b/GPy/core/parameterization/index_operations.py
@@ -5,7 +5,7 @@ import numpy
 from numpy.lib.function_base import vectorize
 from .lists_and_dicts import IntArrayDict
 from functools import reduce
-from transformations import Transformation
+from .transformations import Transformation
 
 def extract_properties_to_index(index, props):
     prop_index = dict()
diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py
index d2d06fe3..96810927 100644
--- a/GPy/core/parameterization/parameterized.py
+++ b/GPy/core/parameterization/parameterized.py
@@ -6,10 +6,10 @@ import numpy; np = numpy
 import itertools
 from re import compile, _pattern_type
 from .param import ParamConcatenation
-from parameter_core import HierarchyError, Parameterizable, adjust_name_for_printing
+from .parameter_core import HierarchyError, Parameterizable, adjust_name_for_printing
 import logging
-from index_operations import ParameterIndexOperationsView
+from .index_operations import ParameterIndexOperationsView
 
 logger = logging.getLogger("parameters changed meta")
 
 class ParametersChangedMeta(type):
diff --git a/GPy/core/parameterization/priors.py b/GPy/core/parameterization/priors.py
index 3a213fcd..c748f0df 100644
--- a/GPy/core/parameterization/priors.py
+++ b/GPy/core/parameterization/priors.py
@@ -758,12 +758,12 @@ class DGPLVM_Lamda(Prior, Parameterized):
         self.sigma2 = sigma2
         # self.x = x
         self.lbl = lbl
-        self.lamda = lamda
+        self.lamda = lamda
         self.classnum = lbl.shape[1]
         self.datanum = lbl.shape[0]
         self.x_shape = x_shape
         self.dim = x_shape[1]
-        self.lamda = Param('lamda', np.diag(lamda))
+        self.lamda = Param('lamda', np.diag(lamda))
         self.link_parameter(self.lamda)
 
     def get_class_label(self, y):
@@ -789,7 +789,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
         M_i = np.zeros((self.classnum, self.dim))
         for i in cls:
             # Mean of each class
-            class_i = cls[i]
+            class_i = cls[i]
             M_i[i] = np.mean(class_i, axis=0)
         return M_i
 
@@ -899,8 +899,8 @@ class DGPLVM_Lamda(Prior, Parameterized):
         #!!!!!!!!!!!!!!!!!!!!!!!!!!!
         #self.lamda.values[:] = self.lamda.values/self.lamda.values.sum()
-        xprime = x.dot(np.diagflat(self.lamda))
-        x = xprime
+        xprime = x.dot(np.diagflat(self.lamda))
+        x = xprime
         # print x
         cls = self.compute_cls(x)
         M_0 = np.mean(x, axis=0)
@@ -910,14 +910,14 @@ class DGPLVM_Lamda(Prior, Parameterized):
         # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
         #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
         #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
         return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
 
     # This function calculates derivative of the log of prior function
     def lnpdf_grad(self, x):
         x = x.reshape(self.x_shape)
-        xprime = x.dot(np.diagflat(self.lamda))
-        x = xprime
+        xprime = x.dot(np.diagflat(self.lamda))
+        x = xprime
         # print x
         cls = self.compute_cls(x)
         M_0 = np.mean(x, axis=0)
@@ -934,7 +934,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
         # Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
         #Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
         #Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
-        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
+        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
         Sb_inv_N_trans = np.transpose(Sb_inv_N)
         Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
         Sw_trans = np.transpose(Sw)
@@ -951,14 +951,14 @@ class DGPLVM_Lamda(Prior, Parameterized):
         # Because of the GPy we need to transpose our matrix so that it gets the same shape as out matrix (denominator layout!!!)
         DPxprim_Dx = DPxprim_Dx.T
 
-        DPxprim_Dlamda = DPx_Dx.dot(x)
+        DPxprim_Dlamda = DPx_Dx.dot(x)
         # Because of the GPy we need to transpose our matrix so that it gets the same shape as out matrix (denominator layout!!!)
-        DPxprim_Dlamda = DPxprim_Dlamda.T
+        DPxprim_Dlamda = DPxprim_Dlamda.T
 
-        self.lamda.gradient = np.diag(DPxprim_Dlamda)
+        self.lamda.gradient = np.diag(DPxprim_Dlamda)
         # print DPxprim_Dx
 
-        return DPxprim_Dx
+        return DPxprim_Dx
 
 
     # def frb(self, x):
@@ -1139,8 +1139,8 @@ class DGPLVM_T(Prior):
     # This function calculates log of our prior
     def lnpdf(self, x):
         x = x.reshape(self.x_shape)
-        xprim = x.dot(self.vec)
-        x = xprim
+        xprim = x.dot(self.vec)
+        x = xprim
         # print x
         cls = self.compute_cls(x)
         M_0 = np.mean(x, axis=0)
@@ -1156,11 +1156,11 @@ class DGPLVM_T(Prior):
 
     # This function calculates derivative of the log of prior function
     def lnpdf_grad(self, x):
-        x = x.reshape(self.x_shape)
-        xprim = x.dot(self.vec)
-        x = xprim
+        x = x.reshape(self.x_shape)
+        xprim = x.dot(self.vec)
+        x = xprim
         # print x
-        cls = self.compute_cls(x)
+        cls = self.compute_cls(x)
         M_0 = np.mean(x, axis=0)
         M_i = self.compute_Mi(cls)
         Sb = self.compute_Sb(cls, M_i, M_0)
diff --git a/GPy/core/parameterization/transformations.py b/GPy/core/parameterization/transformations.py
index d53cb6c8..01a1f44b 100644
--- a/GPy/core/parameterization/transformations.py
+++ b/GPy/core/parameterization/transformations.py
@@ -35,12 +35,12 @@ class Transformation(object):
         """
         compute the log of the jacobian of f, evaluated at f(x)= model_param
         """
-        raise NotImplementedError
+        raise NotImplementedError
    def log_jacobian_grad(self, model_param):
        """
        compute the drivative of the log of the jacobian of f, evaluated at f(x)= model_param
        """
-        raise NotImplementedError
+        raise NotImplementedError
    def gradfactor(self, model_param, dL_dmodel_param):
        """
        df(opt_param)_dopt_param evaluated at self.f(opt_param)=model_param, times the gradient dL_dmodel_param,
diff --git a/GPy/inference/optimization/optimization.py b/GPy/inference/optimization/optimization.py
index fd140688..48bdd809 100644
--- a/GPy/inference/optimization/optimization.py
+++ b/GPy/inference/optimization/optimization.py
@@ -142,7 +142,7 @@ class opt_lbfgsb(Optimizer):
 
         #a more helpful error message is available in opt_result in the Error case
         if opt_result[2]['warnflag']==2:
-            self.status = 'Error' + opt_result[2]['task']
+            self.status = 'Error' + str(opt_result[2]['task'])
 
 class opt_simplex(Optimizer):
     def __init__(self, *args, **kwargs):
diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py
index 2bd55617..f479c387 100644
--- a/GPy/kern/__init__.py
+++ b/GPy/kern/__init__.py
@@ -19,5 +19,5 @@ from ._src.splitKern import SplitKern,DEtime
 from ._src.splitKern import DEtime as DiffGenomeKern
 
-from _src.basis_funcs import LinearSlopeBasisFuncKernel, BasisFuncKernel, ChangePointBasisFuncKernel, DomainKernel
+from ._src.basis_funcs import LinearSlopeBasisFuncKernel, BasisFuncKernel, ChangePointBasisFuncKernel, DomainKernel
diff --git a/GPy/kern/_src/coregionalize.py b/GPy/kern/_src/coregionalize.py
index 7d5f5a2b..3d71c99f 100644
--- a/GPy/kern/_src/coregionalize.py
+++ b/GPy/kern/_src/coregionalize.py
@@ -6,7 +6,7 @@ import numpy as np
 from ...core.parameterization import Param
 from ...core.parameterization.transformations import Logexp
 from ...util.config import config # for assesing whether to use cython
-import coregionalize_cython
+from . import coregionalize_cython
 
 class Coregionalize(Kern):
     """
diff --git a/GPy/kern/_src/independent_outputs.py b/GPy/kern/_src/independent_outputs.py
index 6f8b7be1..9ebb5711 100644
--- a/GPy/kern/_src/independent_outputs.py
+++ b/GPy/kern/_src/independent_outputs.py
@@ -105,7 +105,7 @@ class IndependentOutputs(CombinationKernel):
         if X2 is None:
             # TODO: make use of index_to_slices
             # FIXME: Broken as X is already sliced out
-            print "Warning, gradients_X may not be working, I believe X has already been sliced out by the slicer!"
+            print("Warning, gradients_X may not be working, I believe X has already been sliced out by the slicer!")
             values = np.unique(X[:,self.index_dim])
             slices = [X[:,self.index_dim]==i for i in values]
             [target.__setitem__(s, kern.gradients_X(dL_dK[s,s],X[s],None))
diff --git a/GPy/kern/_src/stationary.py b/GPy/kern/_src/stationary.py
index df064de7..e69e316b 100644
--- a/GPy/kern/_src/stationary.py
+++ b/GPy/kern/_src/stationary.py
@@ -13,7 +13,7 @@ from ...util.config import config # for assesing whether to use cython
 from ...util.caching import Cache_this
 
 try:
-    import stationary_cython
+    from . import stationary_cython
 except ImportError:
     print('warning in sationary: failed to import cython module: falling back to numpy')
     config.set('cython', 'working', 'false')
diff --git a/GPy/models/dpgplvm.py b/GPy/models/dpgplvm.py
index 1c5dccd0..7f947c53 100644
--- a/GPy/models/dpgplvm.py
+++ b/GPy/models/dpgplvm.py
@@ -3,7 +3,7 @@
 
 import numpy as np
 from .. import kern
-from bayesian_gplvm import BayesianGPLVM
+from .bayesian_gplvm import BayesianGPLVM
 from ..core.parameterization.variational import NormalPosterior, NormalPrior
 
 class DPBayesianGPLVM(BayesianGPLVM):
@@ -15,5 +15,5 @@ class DPBayesianGPLVM(BayesianGPLVM):
                  name='bayesian gplvm', mpi_comm=None, normalizer=None, missing_data=False, stochastic=False, batchsize=1):
         super(DPBayesianGPLVM,self).__init__(Y=Y, input_dim=input_dim, X=X, X_variance=X_variance, init=init, num_inducing=num_inducing, Z=Z, kernel=kernel, inference_method=inference_method, likelihood=likelihood, mpi_comm=mpi_comm, normalizer=normalizer, missing_data=missing_data, stochastic=stochastic, batchsize=batchsize, name='dp bayesian gplvm')
-        self.X.mean.set_prior(X_prior)
+        self.X.mean.set_prior(X_prior)
         self.link_parameter(X_prior)
diff --git a/GPy/models/gp_regression.py b/GPy/models/gp_regression.py
index 105a63e7..7266ae92 100644
--- a/GPy/models/gp_regression.py
+++ b/GPy/models/gp_regression.py
@@ -31,7 +31,7 @@ class GPRegression(GP):
 
         if kernel is None:
             kernel = kern.RBF(X.shape[1])
-        likelihood = likelihoods.Gaussian(variance=noise_var)
+        likelihood = likelihoods.Gaussian(variance=noise_var)
 
         super(GPRegression, self).__init__(X, Y, kernel, likelihood, name='GP regression', Y_metadata=Y_metadata, normalizer=normalizer)
diff --git a/GPy/models/gradient_checker.py b/GPy/models/gradient_checker.py
index c2cde834..728f3e23 100644
--- a/GPy/models/gradient_checker.py
+++ b/GPy/models/gradient_checker.py
@@ -228,14 +228,14 @@ class HessianChecker(GradientChecker):
 
         if verbose:
             if block_indices:
-                print "\nBlock {}".format(block_indices)
+                print("\nBlock {}".format(block_indices))
             else:
-                print "\nAll blocks"
+                print("\nAll blocks")
 
             header = ['Checked', 'Max-Ratio', 'Min-Ratio', 'Min-Difference', 'Max-Difference']
             header_string = map(lambda x: ' | '.join(header), [header])
             separator = '-' * len(header_string[0])
-            print '\n'.join([header_string[0], separator])
+            print('\n'.join([header_string[0], separator]))
 
         min_r = '%.6f' % float(numpy.min(ratio))
         max_r = '%.6f' % float(numpy.max(ratio))
         max_d = '%.6f' % float(numpy.max(difference))
@@ -248,7 +248,7 @@ class HessianChecker(GradientChecker):
                 checked = "\033[91m False \033[0m"
 
             grad_string = "{} | {} | {} | {} | {} ".format(checked, cols[0], cols[1], cols[2], cols[3])
-            print grad_string
+            print(grad_string)
 
         if plot:
             import pylab as pb
@@ -348,7 +348,7 @@ class SkewChecker(HessianChecker):
 
         numeric_hess_partial = nd.Jacobian(self._df, vectorized=True)
         numeric_hess = numeric_hess_partial(x)
-        print "Done making numerical hessian"
+        print("Done making numerical hessian")
 
         if analytic_hess.dtype is np.dtype('object'):
             #Blockify numeric_hess aswell
             blocksizes, pagesizes = get_block_shapes_3d(analytic_hess)
@@ -365,7 +365,7 @@ class SkewChecker(HessianChecker):
             #Unless super_plot is set, just plot the first one
             p = True if (plot and block_ind == numeric_hess.shape[2]-1) or super_plot else False
             if verbose:
-                print "Checking derivative of hessian wrt parameter number {}".format(block_ind)
+                print("Checking derivative of hessian wrt parameter number {}".format(block_ind))
             check_passed[block_ind] = self.checkgrad_block(analytic_hess[:,:,block_ind], numeric_hess[:,:,block_ind], verbose=verbose, step=step, tolerance=tolerance, block_indices=block_indices, plot=p)
             current_index += current_size
diff --git a/GPy/testing/link_function_tests.py b/GPy/testing/link_function_tests.py
index fb8fba99..e8cbbf66 100644
--- a/GPy/testing/link_function_tests.py
+++ b/GPy/testing/link_function_tests.py
@@ -49,7 +49,7 @@ class LinkFunctionTests(np.testing.TestCase):
         self.assertTrue(grad3.checkgrad(verbose=True))
 
         if test_lim:
-            print "Testing limits"
+            print("Testing limits")
             #Remove some otherwise we are too close to the limit for gradcheck to work effectively
             lim_of_inf = lim_of_inf - 1e-4
             grad = GradientChecker(link_func.transf, link_func.dtransf_df, x0=lim_of_inf)
diff --git a/GPy/util/block_matrices.py b/GPy/util/block_matrices.py
index e1e04aaa..8c0de67d 100644
--- a/GPy/util/block_matrices.py
+++ b/GPy/util/block_matrices.py
@@ -100,10 +100,10 @@ def block_dot(A, B, diagonal=False):
         Dshape = D.shape
         if diagonal and (len(Cshape) == 1 or len(Dshape) == 1\
                 or C.shape[0] != C.shape[1] or D.shape[0] != D.shape[1]):
-            print "Broadcasting, C: {} D:{}".format(C.shape, D.shape)
+            print("Broadcasting, C: {} D:{}".format(C.shape, D.shape))
             return C*D
         else:
-            print "Dotting, C: {} C:{}".format(C.shape, D.shape)
+            print("Dotting, C: {} C:{}".format(C.shape, D.shape))
             return np.dot(C,D)
     dot = np.vectorize(f, otypes = [np.object])
     return dot(A,B)
diff --git a/GPy/util/choleskies.py b/GPy/util/choleskies.py
index a12631aa..f079cabd 100644
--- a/GPy/util/choleskies.py
+++ b/GPy/util/choleskies.py
@@ -5,7 +5,7 @@
 import numpy as np
 from . import linalg
 from .config import config
-import choleskies_cython
+from . import choleskies_cython
 
 def safe_root(N):
     i = np.sqrt(N)
@@ -59,12 +59,12 @@
     """
     dL_dK = np.tril(dL).copy()
     N = L.shape[0]
-    for k in xrange(N - 1, -1, -1):
-        for j in xrange(k + 1, N):
-            for i in xrange(j, N):
+    for k in range(N - 1, -1, -1):
+        for j in range(k + 1, N):
+            for i in range(j, N):
                 dL_dK[i, k] -= dL_dK[i, j] * L[j, k]
                 dL_dK[j, k] -= dL_dK[i, j] * L[i, k]
-        for j in xrange(k + 1, N):
+        for j in range(k + 1, N):
             dL_dK[j, k] /= L[k, k]
             dL_dK[k, k] -= L[j, k] * dL_dK[j, k]
         dL_dK[k, k] /= (2 * L[k, k])
diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index 634a1e0d..ed73d133 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -15,7 +15,7 @@ import warnings
 import os
 from .config import config
 import logging
-import linalg_cython
+from . import linalg_cython
 
 _scipyversion = np.float64((scipy.__version__).split('.')[:2])
diff --git a/README.md b/README.md
index 60dcbe24..07a65cc6 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,6 @@ Continuous integration status: ![CI status](https://travis-ci.org/SheffieldML/GP
 
 ### Python 3 Compatibility
 Work is underway to make GPy run on Python 3.
-* Python 2.x compatibility is currently broken in this fork
 * All tests in the testsuite now run on Python3.
 
 To see this for yourself, in Ubuntu 14.04, you can do
@@ -21,12 +20,17 @@ To see this for yourself, in Ubuntu 14.04, you can do
     git clone https://github.com/mikecroucher/GPy.git
     cd GPy
     git checkout devel
+    python3 setup.py build_ext --inplace
     nosetests3 GPy/testing
 
 nosetests3 is Ubuntu's way of reffering to the Python 3 version of nosetests. You install it with
 
     sudo apt-get install python3-nose
 
+The command `python3 setup.py build_ext --inplace` builds the Cython extensions. If it doesn't work, you may need to install this:
+
+    sudo apt-get install python3-dev
+
 * Test coverage is less than 100% so it is expected that there is still more work to be done. We need more tests and examples to try out.
 * All weave functions not covered by the test suite are *simply commented out*. Can add equivalents later as test functions become available
 * A set of benchmarks would be useful!
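
For reference, the patch above applies the same few Python 2 to 3 idioms throughout: explicit relative imports, `print` as a function, `range` in place of `xrange`, and `str()` coercion of the bytes that SciPy's L-BFGS-B returns. The snippet below is a minimal, self-contained illustration of those idioms; it is not part of the patch, and the `task` value is a made-up example of the bytes message scipy can return.

    # 1. Explicit relative imports: Python 3 drops implicit relative imports,
    #    so intra-package imports need a leading dot, e.g.
    #    "from . import stationary_cython" instead of "import stationary_cython".

    # 2. print is a function in Python 3, so print statements gain parentheses.
    print("warning: falling back to numpy")

    # 3. xrange is gone; range is lazy in Python 3 and is a drop-in replacement.
    total = 0
    for k in range(9, -1, -1):  # counts 9, 8, ..., 0, like the Cholesky backprop loop
        total += k

    # 4. Concatenating str with bytes raises TypeError in Python 3; str() coercion
    #    (as optimization.py now does) gives e.g. "Errorb'ABNORMAL_...'".
    task = b'ABNORMAL_TERMINATION_IN_LNSRCH'  # made-up example value
    status = 'Error' + str(task)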