diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py index e6296dab..c7849a46 100644 --- a/GPy/examples/regression.py +++ b/GPy/examples/regression.py @@ -114,7 +114,7 @@ def coregionalisation_sparse(max_iters=100): return m def epomeo_gpx(max_iters=100): - """Perform Gaussian process regression on the GPX data from the Mount Epomeo runs. Requires gpxpy to be installed on your system.""" + """Perform Gaussian process regression on the latitude and longitude data from the Mount Epomeo runs. Requires gpxpy to be installed on your system to load in the data.""" data = GPy.util.datasets.epomeo_gpx() num_data_list = [] for Xpart in data['X']: @@ -122,16 +122,16 @@ def epomeo_gpx(max_iters=100): num_data_array = np.array(num_data_list) num_data = num_data_array.sum() - Y = np.zeros((num_data, 3)) + Y = np.zeros((num_data, 2)) t = np.zeros((num_data, 2)) start = 0 for Xpart, index in zip(data['X'], range(len(data['X']))): end = start+Xpart.shape[0] t[start:end, :] = np.hstack((Xpart[:, 0:1], index*np.ones((Xpart.shape[0], 1)))) - Y[start:end, :] = Xpart[:, 1:4] + Y[start:end, :] = Xpart[:, 1:3] - num_inducing = 40 + num_inducing = 200 Z = np.hstack((np.linspace(t[:,0].min(), t[:, 0].max(), num_inducing)[:, None], np.random.randint(0, 4, num_inducing)[:, None])) @@ -139,12 +139,12 @@ def epomeo_gpx(max_iters=100): k2 = GPy.kern.coregionalise(output_dim=5, rank=5) k = k1**k2 - m = GPy.models.SparseGPRegression(t, Y, kernel=k, Z=Z) + m = GPy.models.SparseGPRegression(t, Y, kernel=k, Z=Z, normalize_Y=True) m.constrain_fixed('.*rbf_var', 1.) 
m.constrain_fixed('iip') m.constrain_bounded('noise_variance', 1e-3, 1e-1) # m.optimize_restarts(5, robust=True, messages=1, max_iters=max_iters, optimizer='bfgs') -# m.optimize(max_iters=max_iters) + m.optimize(max_iters=max_iters,messages=True) return m diff --git a/GPy/inference/scg.py b/GPy/inference/scg.py index 7c8dda8d..f4c7c9c4 100644 --- a/GPy/inference/scg.py +++ b/GPy/inference/scg.py @@ -61,6 +61,8 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True, function_eval = 1 fnow = fold gradnew = gradf(x, *optargs) # Initial gradient. + if any(np.isnan(gradnew)): + raise UnexpectedInfOrNan current_grad = np.dot(gradnew, gradnew) gradold = gradnew.copy() d = -gradnew # Initial search direction. diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py index 0919d891..29b0ea03 100644 --- a/GPy/kern/kern.py +++ b/GPy/kern/kern.py @@ -619,3 +619,25 @@ class Kern_check_dK_dX(Kern_check_model): def _set_params(self, x): self.X=x.reshape(self.X.shape) + +class Kern_check_dKdiag_dX(Kern_check_model): + """This class allows gradient checks for the gradient of a kernel diagonal with respect to X. 
""" + def __init__(self, kernel=None, dL_dK=None, X=None, X2=None): + Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None) + if dL_dK==None: + self.dL_dK = np.ones((self.X.shape[0])) + + def log_likelihood(self): + return (self.dL_dK*self.kernel.Kdiag(self.X)).sum() + + def _log_likelihood_gradients(self): + return self.kernel.dKdiag_dX(self.dL_dK, self.X).flatten() + + def _get_param_names(self): + return ['X_' +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])] + + def _get_params(self): + return self.X.flatten() + + def _set_params(self, x): + self.X=x.reshape(self.X.shape) diff --git a/GPy/kern/parts/mlp.py b/GPy/kern/parts/mlp.py index 8f2d8fcd..41eca708 100644 --- a/GPy/kern/parts/mlp.py +++ b/GPy/kern/parts/mlp.py @@ -107,12 +107,22 @@ class MLP(Kernpart): def dK_dX(self, dL_dK, X, X2, target): """Derivative of the covariance matrix with respect to X""" + self._K_computations(X, X2) + arg = self._K_asin_arg + post_div = np.sqrt(1-arg*arg) + numer = self._K_numer + denom = self._K_denom + vec2 = (X2*X2).sum(1)*self.weight_variance + self.bias_variance + 1. 
+ denom3 = denom*denom*denom + target += (((X2[None,:, :]/denom[None, :, None]+vec2[None, None, :]*X[:, :, None]*numer/denom)/post_div[:, :, None]) * dL_dK[:, :, None]).sum(1) + target *= four_over_tau*self.weight_variance*self.variance raise NotImplementedError - # self._K_computations(X, X2) - # gX = np.zeros((X2.shape[0], X.shape[1], X.shape[0])) + + + gX = np.zeros((X2.shape[0], X.shape[1], X.shape[0])) - # for i in range(X.shape[0]): - # gX[:, :, i] = self._dK_dX_point(dL_dK, X, X2, target, i) + for i in range(X.shape[0]): + gX[:, :, i] = self._dK_dX_point(dL_dK, X, X2, target, i) def _dK_dX_point(self, dL_dK, X, X2, target, i): @@ -130,9 +140,8 @@ class MLP(Kernpart): denom3 = denom*denom*denom gX = np.zeros((X2.shape[0], X2.shape[1])) for j in range(X2.shape[1]): - gX[:, j] =X2[:, j]/denom - vec2*X[i, j]*numer/denom3 + gX[:, j] = X2[:, j]/denom - vec2*X[i, j]*numer/denom3 gX[:, j] = four_over_tau*self.weight_variance*self.variance*gX[:, j]/np.sqrt(1-arg*arg) - target[i, :] def _K_computations(self, X, X2): diff --git a/GPy/kern/parts/poly.py b/GPy/kern/parts/poly.py index b1c2c09b..783f8386 100644 --- a/GPy/kern/parts/poly.py +++ b/GPy/kern/parts/poly.py @@ -101,11 +101,15 @@ class POLY(Kernpart): def dK_dX(self, dL_dK, X, X2, target): """Derivative of the covariance matrix with respect to X""" - pass + self._K_computations(X, X2) + arg = self._K_poly_arg + target += self.weight_variance*self.degree*self.variance*(((X2[None,:, :])) *(arg**(self.degree-1))[:, :, None]* dL_dK[:, :, None]).sum(1) - def _dK_dX_point(self, dL_dK, X, X2, target, i): - """Gradient with respect to one point of X""" - pass + def dKdiag_dX(self, dL_dKdiag, X, target): + """Gradient of diagonal of covariance with respect to X""" + self._K_diag_computations(X) + arg = self._K_diag_poly_arg + target += 2.*self.weight_variance*self.degree*self.variance*X*dL_dKdiag[:, None]*(arg**(self.degree-1))[:, None] def _K_computations(self, X, X2): diff --git a/GPy/kern/parts/prod.py 
b/GPy/kern/parts/prod.py index d251a5aa..21fb2e7b 100644 --- a/GPy/kern/parts/prod.py +++ b/GPy/kern/parts/prod.py @@ -62,7 +62,7 @@ class Prod(Kernpart): return self._K2 def dK_dtheta(self,dL_dK,X,X2,target): - """derivative of the covariance matrix with respect to the parameters.""" + """Derivative of the covariance matrix with respect to the parameters.""" self._K_computations(X,X2) if X2 is None: self.k1.dK_dtheta(dL_dK*self._K2, X[:,self.slice1], None, target[:self.k1.num_params]) diff --git a/GPy/notes.txt b/GPy/notes.txt index 36ca478f..52eb78d9 100644 --- a/GPy/notes.txt +++ b/GPy/notes.txt @@ -1,15 +1,15 @@ -Prod.py kernel should take a list of kernels rather than two arguments for kernels. -transformations.py should have limits on what is fed into exp() particularly for the negative log logistic. +Prod.py kernel could also take a list of kernels rather than two arguments for kernels. +transformations.py should have limits on what is fed into exp() particularly for the negative log logistic (done -neil). Load in a model with mlp kernel, plot it, change a parameter, plot it again. It doesn't update the plot. Tests for kernels which work directly on the kernel implementation (not through GP). -Should stationary covariances have their own kernpart type? +Should stationary covariances have their own kernpart type, I think so, also inner product kernels. That way the caching so carefully constructed for RBF or linear could be shared. Where do we declare default kernel parameters. In constructors.py or in the definition file for the kernel? -When printing to stdout, can we check that our approach is also working nicely for the ipython notebook? I like the way our optimization ticks over, but at the moment this doesn't seem to work in the ipython notebook, it would be nice if it did. +When printing to stdout, can we check that our approach is also working nicely for the ipython notebook? 
I like the way our optimization ticks over, but at the moment this doesn't seem to work in the ipython notebook, it would be nice if it did. My problems may be due to using ipython 0.12, I've had a poke around at fixing this and I can't do it for 0.12. When we print a model should we also include information such as number of inputs and number of outputs? @@ -37,3 +37,20 @@ When using bfgs in ipython notebook, text appears in the original console, not i In sparse GPs wouldn't it be clearer to call Z inducing? In coregionalisation matrix, setting the W to all ones will (surely?) ensure that symmetry isn't broken. Also, but allowing it to scale like that, the output variance increases as rank is increased (and if user sets rank to more than output dim they could get very different results). + +We are inconsistent about our use of ise and ize e.g. optimize and normalize_X, but coregionalise, we should choose one and stick to it. Suggest -ize. + +Exceptions: we need to provide a list of exceptions we throw and specify what is thrown where. + +Why is it get_params() but it's getstate()? Should be get_state(). Why is it get_gradient instead of get_gradients? Need to be consistent!! Doesn't matter which way we choose as long as it's consistent. + +In likelihood Nparams should be num_params + +In likelihood N should be num_data + +The Gaussian target in likelihood should be F. What is V doing here? + +Need to check for nan values in likelihoods. These should be treated as missing values. If the likelihood can't handle the missing value an error should be thrown. + + +Sometimes you want to print kernpart objects, for diagnosis etc. This isn't possible currently. diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py index be4d0538..fa7fd833 100644 --- a/GPy/util/datasets.py +++ b/GPy/util/datasets.py @@ -16,10 +16,9 @@ import datetime import sys, urllib def reporthook(a,b,c): # ',' at the end of the line is important!
- print "% 3.1f%% of %d bytes\r" % (min(100, float(a * b) / c * 100), c), + #print "% 3.1f%% of %d bytes\r" % (min(100, float(a * b) / c * 100), c), #you can also use sys.stdout.write - #sys.stdout.write("\r% 3.1f%% of %d bytes" - # % (min(100, float(a * b) / c * 100), c) + sys.stdout.write("\r% 3.1f%% of %d bytes" % (min(100, float(a * b) / c * 100), c)) sys.stdout.flush() # Global variables @@ -288,7 +287,7 @@ except ImportError: gpxpy_available = False if gpxpy_available: - def epomeo_gpx(data_set='epomeo_gpx'): + def epomeo_gpx(data_set='epomeo_gpx', sample_every=4): if not data_available(data_set): download_data(data_set) files = ['endomondo_1', 'endomondo_2', 'garmin_watch_via_endomondo','viewranger_phone', 'viewranger_tablet'] @@ -301,7 +300,7 @@ if gpxpy_available: segment = gpx.tracks[0].segments[0] points = [point for track in gpx.tracks for segment in track.segments for point in segment.points] data = [[(point.time-datetime.datetime(2013,8,21)).total_seconds(), point.latitude, point.longitude, point.elevation] for point in points] - X.append(np.asarray(data)) + X.append(np.asarray(data)[::sample_every, :]) gpx_file.close() return data_details_return({'X' : X, 'info' : 'Data is an array containing time in seconds, latitude, longitude and elevation in that order.'}, data_set)