diff --git a/.coveragerc b/.coveragerc
index f01350f9..512c99c4 100644
--- a/.coveragerc
+++ b/.coveragerc
@@ -2,7 +2,7 @@
 [run]
 branch = True
 source = GPy
-omit = ./GPy/testing/*.py, travis_tests.py, setup.py, ./GPy/__version__.py
+omit = ./GPy/examples/*.py, ./GPy/testing/*.py, travis_tests.py, setup.py, ./GPy/__version__.py
 
 [report]
 # Regexes for lines to exclude from consideration
diff --git a/.travis.yml b/.travis.yml
index 71d7bda6..cfa0d351 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -63,6 +63,5 @@ deploy:
   on:
     tags: true
     branch: deploy
-  #condition: "$TRAVIS_OS_NAME" == "osx" || ( "$TRAVIS_OS_NAME" == "linux" && "$PYTHON_VERSION" == "2.7" )
   distributions: $DIST
   skip_cleanup: true
diff --git a/GPy/core/parameterization/priors.py b/GPy/core/parameterization/priors.py
index cb7699eb..c21f6bc5 100644
--- a/GPy/core/parameterization/priors.py
+++ b/GPy/core/parameterization/priors.py
@@ -773,7 +773,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
     def compute_cls(self, x):
         cls = {}
         # Appending each data point to its proper class
-        for j in xrange(self.datanum):
+        for j in range(self.datanum):
             class_label = self.get_class_label(self.lbl[j])
             if class_label not in cls:
                 cls[class_label] = []
@@ -792,7 +792,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
     # Adding data points as tuple to the dictionary so that we can access indices
     def compute_indices(self, x):
         data_idx = {}
-        for j in xrange(self.datanum):
+        for j in range(self.datanum):
             class_label = self.get_class_label(self.lbl[j])
             if class_label not in data_idx:
                 data_idx[class_label] = []
@@ -811,7 +811,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
             else:
                 lst_idx = []
             # Here we put indices of each class in to the list called lst_idx_all
-            for m in xrange(len(data_idx[i])):
+            for m in range(len(data_idx[i])):
                 lst_idx.append(data_idx[i][m][0])
             lst_idx_all.append(lst_idx)
         return lst_idx_all
@@ -847,7 +847,7 @@ class DGPLVM_Lamda(Prior, Parameterized):
             # pdb.set_trace()
             # Calculating Bi
             B_i[i] = (M_i[i] - M_0).reshape(1, self.dim)
-        for k in xrange(self.datanum):
+        for k in range(self.datanum):
             for i in data_idx:
                 N_i = float(len(data_idx[i]))
                 if k in lst_idx_all[i]:
diff --git a/GPy/core/symbolic.py b/GPy/core/symbolic.py
index 4a9fcb76..c4261e24 100644
--- a/GPy/core/symbolic.py
+++ b/GPy/core/symbolic.py
@@ -111,8 +111,8 @@ class Symbolic_core():
                 #     rows = func['function'].shape[0]
                 #     cols = func['function'].shape[1]
                 #     self.expressions[key]['derivative'] = sym.zeros(rows, cols)
-                #     for i in xrange(rows):
-                #         for j in xrange(cols):
+                #     for i in range(rows):
+                #         for j in range(cols):
                 #             self.expressions[key]['derivative'][i, j] = extract_derivative(func['function'][i, j], derivative_arguments)
                 # else:
                     self.expressions[key]['derivative'] = extract_derivative(func['function'], derivative_arguments)
@@ -123,7 +123,7 @@ class Symbolic_core():
             val = 1.0
             # TODO: improve approach for initializing parameters.
             if parameters is not None:
-                if parameters.has_key(theta.name):
+                if theta.name in parameters:
                     val = parameters[theta.name]
             # Add parameter.
             
@@ -176,7 +176,7 @@ class Symbolic_core():
         return gradient
         
     def eval_gradients_X(self, function, partial, **kwargs):
-        if kwargs.has_key('X'):
+        if 'X' in kwargs:
             gradients_X = np.zeros_like(kwargs['X'])
         self.eval_update_cache(**kwargs)
         for i, theta in enumerate(self.variables['X']):
@@ -405,7 +405,7 @@ class Symbolic_core():
                 if var_name == var.name:
                     expr = expr.subs(var, sub)
                     break
-        for m, r in function_substitutes.iteritems():
+        for m, r in function_substitutes.items():
             expr = expr.replace(m, r)#normcdfln, lambda arg : sym.log(normcdf(arg)))
         return expr.simplify()
 
@@ -417,4 +417,4 @@ class Symbolic_core():
             else:
                 return x[0]
             
-        return sorted(var_dict.iteritems(), key=sort_key, reverse=reverse)
+        return sorted(var_dict.items(), key=sort_key, reverse=reverse)
diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index f1df3cf9..81e1b773 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -184,7 +184,7 @@ def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40,
         data_show = GPy.plotting.matplot_dep.visualize.vector_show((m.Y[0, :]))
         lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean.values[0:1, :],  # @UnusedVariable
             m, data_show, latent_axes=latent_axes, sense_axes=sense_axes, labels=m.data_labels)
-        raw_input('Press enter to finish')
+        input('Press enter to finish')
         plt.close(fig)
     return m
 
@@ -210,7 +210,7 @@ def ssgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40
         data_show = GPy.plotting.matplot_dep.visualize.vector_show((m.Y[0, :]))
         lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean.values[0:1, :],  # @UnusedVariable
             m, data_show, latent_axes=latent_axes, sense_axes=sense_axes, labels=m.data_labels)
-        raw_input('Press enter to finish')
+        input('Press enter to finish')
         plt.close(fig)
     return m
 
@@ -242,7 +242,7 @@ def _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim=False):
         fig.clf()
         ax = fig.add_subplot(2, 1, 1)
         labls = slist_names
-        for S, lab in itertools.izip(slist, labls):
+        for S, lab in zip(slist, labls):
             ax.plot(S, label=lab)
         ax.legend()
         for i, Y in enumerate(Ylist):
@@ -288,7 +288,7 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, plot_sim=False):
         fig.clf()
         ax = fig.add_subplot(2, 1, 1)
         labls = slist_names
-        for S, lab in itertools.izip(slist, labls):
+        for S, lab in zip(slist, labls):
             ax.plot(S, label=lab)
         ax.legend()
         for i, Y in enumerate(Ylist):
@@ -520,7 +520,7 @@ def brendan_faces(optimize=True, verbose=True, plot=True):
         y = m.Y[0, :]
         data_show = GPy.plotting.matplot_dep.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, order='F', invert=False, scale=False)
         lvm = GPy.plotting.matplot_dep.visualize.lvm(m.X.mean[0, :].copy(), m, data_show, ax)
-        raw_input('Press enter to finish')
+        input('Press enter to finish')
 
     return m
 
@@ -542,7 +542,7 @@ def olivetti_faces(optimize=True, verbose=True, plot=True):
         y = m.Y[0, :]
         data_show = GPy.plotting.matplot_dep.visualize.image_show(y[None, :], dimensions=(112, 92), transpose=False, invert=False, scale=False)
         lvm = GPy.plotting.matplot_dep.visualize.lvm(m.X.mean[0, :].copy(), m, data_show, ax)
-        raw_input('Press enter to finish')
+        input('Press enter to finish')
 
     return m
 
@@ -577,7 +577,7 @@ def stick(kernel=None, optimize=True, verbose=True, plot=True):
         y = m.Y[0, :]
         data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect'])
         lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(m.X[:1, :].copy(), m, data_show, latent_axes=ax)
-        raw_input('Press enter to finish')
+        input('Press enter to finish')
         lvm_visualizer.close()
         data_show.close()
     return m
@@ -598,7 +598,7 @@ def bcgplvm_linear_stick(kernel=None, optimize=True, verbose=True, plot=True):
         y = m.likelihood.Y[0, :]
         data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect'])
         GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
-        raw_input('Press enter to finish')
+        input('Press enter to finish')
 
     return m
 
@@ -619,7 +619,7 @@ def bcgplvm_stick(kernel=None, optimize=True, verbose=True, plot=True):
         y = m.likelihood.Y[0, :]
         data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect'])
         GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
-        # raw_input('Press enter to finish')
+        # input('Press enter to finish')
 
     return m
 
@@ -669,7 +669,7 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
         fig.canvas.draw()
         # Canvas.show doesn't work on OSX.
         #fig.canvas.show()
-        raw_input('Press enter to finish')
+        input('Press enter to finish')
 
     return m
 
@@ -693,7 +693,7 @@ def cmu_mocap(subject='35', motion=['01'], in_place=True, optimize=True, verbose
         y = m.Y[0, :]
         data_show = GPy.plotting.matplot_dep.visualize.skeleton_show(y[None, :], data['skel'])
         lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(m.X[0].copy(), m, data_show, latent_axes=ax)
-        raw_input('Press enter to finish')
+        input('Press enter to finish')
         lvm_visualizer.close()
         data_show.close()
 
diff --git a/GPy/examples/state_space.py b/GPy/examples/state_space.py
index fdb7fdd5..5a213f45 100644
--- a/GPy/examples/state_space.py
+++ b/GPy/examples/state_space.py
@@ -10,17 +10,17 @@ Y = np.sin(X) + np.random.randn(*X.shape)*0.1
 kernel1 = GPy.kern.Matern32(X.shape[1])
 m1  = GPy.models.GPRegression(X,Y, kernel1)
 
-print m1
+print(m1)
 m1.optimize(optimizer='bfgs',messages=True)
 
-print m1
+print(m1)
 
 kernel2 = GPy.kern.sde_Matern32(X.shape[1])
 #m2  = SS_model.StateSpace(X,Y, kernel2)
 m2 = GPy.models.StateSpace(X,Y, kernel2)
-print m2
+print(m2)
 
 m2.optimize(optimizer='bfgs',messages=True)
 
-print m2
+print(m2)
 
diff --git a/GPy/kern/src/integral.py b/GPy/kern/src/integral.py
index 971a48a8..6febf203 100644
--- a/GPy/kern/src/integral.py
+++ b/GPy/kern/src/integral.py
@@ -1,5 +1,6 @@
 # Written by Mike Smith michaeltsmith.org.uk
 
+from __future__ import division
 import numpy as np
 from .kern import Kern
 from ...core.parameterization import Param
@@ -24,7 +25,7 @@ class Integral(Kern): #todo do I need to inherit from Stationary
         self.link_parameters(self.variances, self.lengthscale) #this just takes a list of parameters we need to optimise.
 
     def h(self, z):
-        return 0.5 * z * np.sqrt(math.pi) * math.erf(z) + np.exp(-(z**2))
+        return 0.5 * z * np.sqrt(math.pi) * math.erf(z) + np.exp(-(z**2))        
 
     def dk_dl(self, t, tprime, l): #derivative of the kernel wrt lengthscale
         return l * ( self.h(t/l) - self.h((t - tprime)/l) + self.h(tprime/l) - 1)
@@ -39,10 +40,8 @@ class Integral(Kern): #todo do I need to inherit from Stationary
                     dK_dv[i,j] = self.k_xx(x[0],x2[0],self.lengthscale[0])  #the gradient wrt the variance is k_xx.
             self.lengthscale.gradient = np.sum(dK_dl * dL_dK)
             self.variances.gradient = np.sum(dK_dv * dL_dK)
-            #print "V%0.5f" % self.variances.gradient
-            #print "L%0.5f" % self.lengthscale.gradient
         else:     #we're finding dK_xf/Dtheta
-            print("NEED TO HANDLE TODO!")
+            raise NotImplementedError("Currently this function only handles finding the gradient of a single vector of inputs (X) not a pair of vectors (X and X2)")
 
     #useful little function to help calculate the covariances.
     def g(self,z):
@@ -71,7 +70,6 @@ class Integral(Kern): #todo do I need to inherit from Stationary
             for i,x in enumerate(X):
                 for j,x2 in enumerate(X2):
                     K_xf[i,j] = self.k_xf(x[0],x2[0],self.lengthscale[0])
-            #print self.variances[0]
             return K_xf * self.variances[0]
 
     def Kdiag(self, X):
diff --git a/GPy/kern/src/integral_limits.py b/GPy/kern/src/integral_limits.py
index 7006ee6f..10370328 100644
--- a/GPy/kern/src/integral_limits.py
+++ b/GPy/kern/src/integral_limits.py
@@ -1,17 +1,23 @@
 # Written by Mike Smith michaeltsmith.org.uk
 
+from __future__ import division
+import math
 import numpy as np
 from .kern import Kern
 from ...core.parameterization import Param
 from paramz.transformations import Logexp
-import math
 
-class Integral_Limits(Kern): #todo do I need to inherit from Stationary
+
+class Integral_Limits(Kern): 
     """
-    Integral kernel, can include limits on each integral value.
+    Integral kernel. This kernel allows 1d histogram or binned data to be modelled.
+    The outputs are the counts in each bin. The inputs (on two dimensions) are the start and end points of each bin.
+    The kernel's predictions are the latent function which might have generated those binned results.
     """
 
     def __init__(self, input_dim, variances=None, lengthscale=None, ARD=False, active_dims=None, name='integral'):
+        """
+        """
         super(Integral_Limits, self).__init__(input_dim, active_dims, name)
 
         if lengthscale is None:
@@ -39,10 +45,8 @@ class Integral_Limits(Kern): #todo do I need to inherit from Stationary
                     dK_dv[i,j] = self.k_xx(x[0],x2[0],x[1],x2[1],self.lengthscale[0])  #the gradient wrt the variance is k_xx.
             self.lengthscale.gradient = np.sum(dK_dl * dL_dK)
             self.variances.gradient = np.sum(dK_dv * dL_dK)
-            #print "V%0.5f" % self.variances.gradient
-            #print "L%0.5f" % self.lengthscale.gradient
         else:     #we're finding dK_xf/Dtheta
-            print("NEED TO HANDLE TODO!")
+            raise NotImplementedError("Currently this function only handles finding the gradient of a single vector of inputs (X) not a pair of vectors (X and X2)")
 
     #useful little function to help calculate the covariances.
     def g(self,z):
@@ -71,6 +75,22 @@ class Integral_Limits(Kern): #todo do I need to inherit from Stationary
         return 0.5 * np.sqrt(math.pi) * l * (math.erf((t-tprime)/l) + math.erf((tprime-s)/l))
 
     def K(self, X, X2=None):
+        """Note: We have a latent function and an output function. We want to be able to find:
+          - the covariance between values of the output function
+          - the covariance between values of the latent function
+          - the "cross covariance" between values of the output function and the latent function
+        This method is used by GPy to either get the covariance between the outputs (K_xx) or
+        is used to get the cross covariance (between the latent function and the outputs (K_xf).
+        We take advantage of the places where this function is used:
+         - if X2 is none, then we know that the items being compared (to get the covariance for)
+         are going to be both from the OUTPUT FUNCTION.
+         - if X2 is not none, then we know that the items being compared are from two different
+         sets (the OUTPUT FUNCTION and the LATENT FUNCTION).
+        
+        If we want the covariance between values of the LATENT FUNCTION, we take advantage of
+        the fact that we only need that when we do prediction, and this only calls Kdiag (not K).
+        So the covariance between LATENT FUNCTIONS is available from Kdiag.        
+        """
         if X2 is None:
             K_xx = np.zeros([X.shape[0],X.shape[0]])
             for i,x in enumerate(X):
@@ -85,8 +105,9 @@ class Integral_Limits(Kern): #todo do I need to inherit from Stationary
             return K_xf * self.variances[0]
 
     def Kdiag(self, X):
-        """I've used the fact that we call this method for K_ff when finding the covariance as a hack so
-        I know if I should return K_ff or K_xx. In this case we're returning K_ff!!
+        """I've used the fact that we call this method during prediction (instead of K). When we
+        do prediction we want to know the covariance between LATENT FUNCTIONS (K_ff) (as that's probably
+        what the user wants).
         $K_{ff}^{post} = K_{ff} - K_{fx} K_{xx}^{-1} K_{xf}$"""
         K_ff = np.zeros(X.shape[0])
         for i,x in enumerate(X):
diff --git a/GPy/kern/src/multidimensional_integral_limits.py b/GPy/kern/src/multidimensional_integral_limits.py
index 0f473742..8a07595b 100644
--- a/GPy/kern/src/multidimensional_integral_limits.py
+++ b/GPy/kern/src/multidimensional_integral_limits.py
@@ -1,5 +1,6 @@
 # Written by Mike Smith michaeltsmith.org.uk
 
+from __future__ import division
 import numpy as np
 from .kern import Kern
 from ...core.parameterization import Param
@@ -8,7 +9,11 @@ import math
 
 class Multidimensional_Integral_Limits(Kern): #todo do I need to inherit from Stationary
     """
-    Integral kernel, can include limits on each integral value.
+    Integral kernel, can include limits on each integral value. This kernel allows an n-dimensional
+    histogram or binned data to be modelled. The outputs are the counts in each bin. The inputs
+    are the start and end points of each bin: Pairs of inputs act as the limits on each bin. So
+    inputs 4 and 5 provide the start and end values of each bin in the 3rd dimension.
+    The kernel's predictions are the latent function which might have generated those binned results.    
     """
 
     def __init__(self, input_dim, variances=None, lengthscale=None, ARD=False, active_dims=None, name='integral'):
@@ -30,7 +35,6 @@ class Multidimensional_Integral_Limits(Kern): #todo do I need to inherit from St
         return l * ( self.h((t-sprime)/l) - self.h((t - tprime)/l) + self.h((tprime-s)/l) - self.h((s-sprime)/l))
 
     def update_gradients_full(self, dL_dK, X, X2=None):
-        #print self.variances
         if X2 is None:  #we're finding dK_xx/dTheta
             dK_dl_term = np.zeros([X.shape[0],X.shape[0],self.lengthscale.shape[0]])
             k_term = np.zeros([X.shape[0],X.shape[0],self.lengthscale.shape[0]])
@@ -47,14 +51,12 @@ class Multidimensional_Integral_Limits(Kern): #todo do I need to inherit from St
                 for jl, l in enumerate(self.lengthscale):
                     if jl!=il:
                         dK_dl *= k_term[:,:,jl]
-                        #dK_dl = np.dot(dK_dl,k_term[:,:,il])
-                        #print k_term[:,:,il]
                 self.lengthscale.gradient[il] = np.sum(dK_dl * dL_dK)
             dK_dv = self.calc_K_xx_wo_variance(X) #the gradient wrt the variance is k_xx.
             self.variances.gradient = np.sum(dK_dv * dL_dK)
         else:     #we're finding dK_xf/Dtheta
-            print("NEED TO HANDLE TODO!")
-        #print self.variances[0],self.lengthscale[0],self.lengthscale[1] #np.sum(dK_dv*dL_dK)
+            raise NotImplementedError("Currently this function only handles finding the gradient of a single vector of inputs (X) not a pair of vectors (X and X2)")
+
 
 
     #useful little function to help calculate the covariances.
@@ -94,12 +96,10 @@ class Multidimensional_Integral_Limits(Kern): #todo do I need to inherit from St
         return K_xx
 
     def K(self, X, X2=None):
-        if X2 is None:
-            #print "X x X"
+        if X2 is None: #X vs X
             K_xx = self.calc_K_xx_wo_variance(X)
             return K_xx * self.variances[0]
-        else:
-            #print "X x X2"
+        else: #X vs X2
             K_xf = np.ones([X.shape[0],X2.shape[0]])
             for i,x in enumerate(X):
                 for j,x2 in enumerate(X2):
diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py
index 78f72d9d..c5b2094f 100644
--- a/GPy/likelihoods/likelihood.py
+++ b/GPy/likelihoods/likelihood.py
@@ -678,7 +678,7 @@ class Likelihood(Parameterized):
         burnin_cache = np.zeros(par_chains)
         burnin_cache[:] = starting_loc.flatten()
         burning_in = True
-        for i in xrange(burn_in+num_samples):
+        for i in range(burn_in+num_samples):
             next_ind = i-burn_in
             if burning_in:
                 old_y = burnin_cache
diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py
index c8ff1664..7d10bab6 100644
--- a/GPy/models/ss_gplvm.py
+++ b/GPy/models/ss_gplvm.py
@@ -291,12 +291,12 @@ class SSGPLVM(SparseGP_MPI):
         Xs[b>self.X.gamma.values] = 0
         
         invcov = (Xs[:,:,:,None]*Xs[:,:,None,:]).sum(1)/noise_var+np.eye(Q)
-        cov = np.array([pdinv(invcov[s_idx])[0] for s_idx in xrange(invcov.shape[0])])
+        cov = np.array([pdinv(invcov[s_idx])[0] for s_idx in range(invcov.shape[0])])
         Ws = np.empty((nSamples, Q, D))
         tmp = (np.transpose(Xs, (0,2,1)).reshape(nSamples*Q,N).dot(self.Y)).reshape(nSamples,Q,D)
         mean = (cov[:,:,:,None]*tmp[:,None,:,:]).sum(2)/noise_var
         zeros = np.zeros((Q,))
-        for s_idx in xrange(Xs.shape[0]):
+        for s_idx in range(Xs.shape[0]):
             Ws[s_idx] = (np.random.multivariate_normal(mean=zeros,cov=cov[s_idx],size=(D,))).T+mean[s_idx]
         
         if raw_samples:
diff --git a/GPy/models/ss_mrd.py b/GPy/models/ss_mrd.py
index d571a542..0aa472c7 100644
--- a/GPy/models/ss_mrd.py
+++ b/GPy/models/ss_mrd.py
@@ -25,7 +25,7 @@ class SSMRD(Model):
         self.X = NormalPosterior(means=X, variances=X_variance)
         
         if kernels is None:
-            kernels = [RBF(input_dim, lengthscale=1./fracs, ARD=True) for i in xrange(len(Ylist))]
+            kernels = [RBF(input_dim, lengthscale=1./fracs, ARD=True) for i in range(len(Ylist))]
         if Zs is None:
             Zs = [None]* len(Ylist)
         if likelihoods is None:
@@ -34,9 +34,9 @@ class SSMRD(Model):
             inference_methods = [None]* len(Ylist)
         
         if IBP:
-            self.var_priors = [IBPPrior_SSMRD(len(Ylist),input_dim,alpha=alpha) for i in xrange(len(Ylist))]
+            self.var_priors = [IBPPrior_SSMRD(len(Ylist),input_dim,alpha=alpha) for i in range(len(Ylist))]
         else:
-            self.var_priors = [SpikeAndSlabPrior_SSMRD(nModels=len(Ylist),pi=pi,learnPi=False, group_spike=group_spike) for i in xrange(len(Ylist))]
+            self.var_priors = [SpikeAndSlabPrior_SSMRD(nModels=len(Ylist),pi=pi,learnPi=False, group_spike=group_spike) for i in range(len(Ylist))]
         self.models = [SSGPLVM(y, input_dim, X=X.copy(), X_variance=X_variance.copy(), Gamma=Gammas[i], num_inducing=num_inducing,Z=Zs[i], learnPi=False, group_spike=group_spike,
                                kernel=kernels[i],inference_method=inference_methods[i],likelihood=likelihoods[i], variational_prior=self.var_priors[i], IBP=IBP, tau=None if taus is None else taus[i],
                                name='model_'+str(i), mpi_comm=mpi_comm, sharedX=True) for i,y in enumerate(Ylist)]
@@ -73,7 +73,7 @@ class SSMRD(Model):
         # Divide latent dimensions
         idx = np.empty((input_dim,),dtype=np.int)
         residue = (input_dim)%(len(Ylist))
-        for i in xrange(len(Ylist)):
+        for i in range(len(Ylist)):
             if i < residue:
                 size = input_dim/len(Ylist)+1
                 idx[i*size:(i+1)*size] = i
@@ -86,7 +86,7 @@ class SSMRD(Model):
                 X = np.empty((Ylist[0].shape[0],input_dim))
                 fracs = np.empty((input_dim,))
                 from ..util.initialization import initialize_latent
-                for i in xrange(len(Ylist)):
+                for i in range(len(Ylist)):
                     Y = Ylist[i]
                     dim = (idx==i).sum()
                     if dim>0:
diff --git a/GPy/models/state_space_main.py b/GPy/models/state_space_main.py
index 891c0326..d0406e96 100644
--- a/GPy/models/state_space_main.py
+++ b/GPy/models/state_space_main.py
@@ -13,7 +13,7 @@ import scipy as sp
 import scipy.linalg as linalg
 
 try:
-    import state_space_setup
+    from . import state_space_setup
     setup_available = True
 except ImportError as e:
     setup_available = False
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index 99951eb1..5bd86e76 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -193,7 +193,12 @@ def check_kernel_gradient_functions(kern, X=None, X2=None, output_ind=None, verb
 
     if verbose:
         print("Checking gradients of K(X, X2) wrt theta.")
-    result = Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=verbose)
+    try:
+        result = Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=verbose)
+    except NotImplementedError:
+        result=True
+        if verbose:
+            print(("update_gradients_full, with differing X and X2, not implemented for " + kern.name))
     if result and verbose:
         print("Check passed.")
     if not result:
@@ -416,6 +421,21 @@ class KernelGradientTestsContinuous(unittest.TestCase):
         k.randomize()
         self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
 
+    def test_integral(self):
+        k = GPy.kern.Integral(1)
+        k.randomize()
+        self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
+
+    def test_multidimensional_integral_limits(self):
+        k = GPy.kern.Multidimensional_Integral_Limits(2)
+        k.randomize()
+        self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
+
+    def test_integral_limits(self):
+        k = GPy.kern.Integral_Limits(2)
+        k.randomize()
+        self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
+
     def test_Linear(self):
         k = GPy.kern.Linear(self.D)
         k.randomize()
diff --git a/GPy/testing/util_tests.py b/GPy/testing/util_tests.py
index b89b3601..3c6241f3 100644
--- a/GPy/testing/util_tests.py
+++ b/GPy/testing/util_tests.py
@@ -96,3 +96,14 @@ class TestDebug(unittest.TestCase):
         self.assertTrue((2, np.median(X.mean.values[:,2])) in fixed)
         self.assertTrue(len([t for t in fixed if t[0] == 1]) == 0) # Unfixed input should not be in fixed
 
+    def test_subarray(self):
+        import GPy
+        X = np.zeros((3,6), dtype=bool)
+        X[[1,1,1],[0,4,5]] = 1
+        X[1:,[2,3]] = 1
+        d = GPy.util.subarray_and_sorting.common_subarrays(X,axis=1)
+        self.assertTrue(len(d) == 3)
+        X[:, d[tuple(X[:,0])]]
+        self.assertTrue(d[tuple(X[:,4])] == d[tuple(X[:,0])] == [0, 4, 5])
+        self.assertTrue(d[tuple(X[:,1])] == [1])
+
diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py
index 68c1732f..2d1d3244 100644
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@@ -73,7 +73,7 @@ def prompt_user(prompt):
 
     try:
         print(prompt)
-        choice = raw_input().lower()
+        choice = input().lower()
         # would like to test for exception here, but not sure if we can do that without importing IPython
     except:
         print('Stdin is not implemented.')
@@ -96,16 +96,16 @@ def prompt_user(prompt):
 def data_available(dataset_name=None):
     """Check if the data set is available on the local machine already."""
     try:
-        from itertools import izip_longest
+        from itertools import zip_longest
     except ImportError:
-        from itertools import zip_longest as izip_longest
+        from itertools import zip_longest as zip_longest
     dr = data_resources[dataset_name]
     zip_urls = (dr['files'], )
     if 'save_names' in dr: zip_urls += (dr['save_names'], )
     else: zip_urls += ([],)
 
-    for file_list, save_list in izip_longest(*zip_urls, fillvalue=[]):
-        for f, s in izip_longest(file_list, save_list, fillvalue=None):
+    for file_list, save_list in zip_longest(*zip_urls, fillvalue=[]):
+        for f, s in zip_longest(file_list, save_list, fillvalue=None):
             if s is not None: f=s # If there is a save_name given, use that one
             if not os.path.exists(os.path.join(data_path, dataset_name, f)):
                 return False
@@ -138,7 +138,7 @@ def download_url(url, store_directory, save_name=None, messages=True, suffix='')
             raise ValueError('Tried url ' + url + suffix + ' and received server error ' + str(response.code))
     with open(save_name, 'wb') as f:
         meta = response.info()
-        content_length_str = meta.getheaders("Content-Length")
+        content_length_str = meta.get("Content-Length")
         if content_length_str:
             file_size = int(content_length_str[0])
         else:
@@ -214,14 +214,14 @@ def download_data(dataset_name=None):
 
     zip_urls = (dr['urls'], dr['files'])
 
-    if dr.has_key('save_names'): zip_urls += (dr['save_names'], )
+    if 'save_names' in dr: zip_urls += (dr['save_names'], )
     else: zip_urls += ([],)
 
-    if dr.has_key('suffices'): zip_urls += (dr['suffices'], )
+    if 'suffices' in dr: zip_urls += (dr['suffices'], )
     else: zip_urls += ([],)
 
-    for url, files, save_names, suffices in itertools.izip_longest(*zip_urls, fillvalue=[]):
-        for f, save_name, suffix in itertools.izip_longest(files, save_names, suffices, fillvalue=None):
+    for url, files, save_names, suffices in itertools.zip_longest(*zip_urls, fillvalue=[]):
+        for f, save_name, suffix in itertools.zip_longest(files, save_names, suffices, fillvalue=None):
             download_url(os.path.join(url,f), dataset_name, save_name, suffix=suffix)
 
     return True
@@ -361,7 +361,7 @@ def football_data(season='1314', data_set='football_data'):
         return league_dict[string]
 
     def football2num(string):
-        if football_dict.has_key(string):
+        if string in football_dict:
             return football_dict[string]
         else:
             football_dict[string] = len(football_dict)+1
diff --git a/GPy/util/subarray_and_sorting.py b/GPy/util/subarray_and_sorting.py
index 0966084c..645e7f1e 100644
--- a/GPy/util/subarray_and_sorting.py
+++ b/GPy/util/subarray_and_sorting.py
@@ -50,7 +50,7 @@ def common_subarrays(X, axis=0):
     cnt = count()
     def accumulate(x, s, c):
         t = tuple(x)
-        col = c.next()
+        col = next(c)
         iadd(s[t], [col])
         return None
     if axis == 0: [accumulate(x, subarrays, cnt) for x in X]