diff --git a/GPy/kern/constructors.py b/GPy/kern/constructors.py
index 025867a9..f7c0fd67 100644
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@@ -69,39 +69,37 @@ def mlp(input_dim,variance=1., weight_variance=None,bias_variance=100.,ARD=False
     part = parts.mlp.MLP(input_dim,variance,weight_variance,bias_variance,ARD)
     return kern(input_dim, [part])

-# def gibbs(input_dim,variance=1., mapping=None):
-#     """
-#     Gibbs and MacKay non-stationary covariance function.
+def gibbs(input_dim,variance=1., mapping=None):
+    """
+    Gibbs and MacKay non-stationary covariance function.

-#     .. math::
+    .. math::

-#        r = sqrt((x_i - x_j)'*(x_i - x_j))
+       r = sqrt((x_i - x_j)'*(x_i - x_j))

-#        k(x_i, x_j) = \sigma^2*Z*exp(-r^2/(l(x)*l(x) + l(x')*l(x')))
+       k(x_i, x_j) = \sigma^2*Z*exp(-r^2/(l(x)*l(x) + l(x')*l(x')))

-#        Z = \sqrt{2*l(x)*l(x')/(l(x)*l(x) + l(x')*l(x')}
+       Z = \sqrt{2*l(x)*l(x')/(l(x)*l(x) + l(x')*l(x'))}

-#     where :math:`l(x)` is a function giving the length scale as a function of space.
-#     This is the non stationary kernel proposed by Mark Gibbs in his 1997
-#     thesis. It is similar to an RBF but has a length scale that varies
-#     with input location. This leads to an additional term in front of
-#     the kernel.
+    where :math:`l(x)` is a function giving the length scale as a function of space.
+    This is the non-stationary kernel proposed by Mark Gibbs in his 1997
+    thesis. It is similar to an RBF, but has a length scale that varies
+    with input location. This leads to an additional term in front of
+    the kernel.

-#     The parameters are :math:`\sigma^2`, the process variance, and the parameters of l(x) which is a function that can be specified by the user, by default an multi-layer peceptron is used is used.
+    The parameters are :math:`\sigma^2`, the process variance, and the parameters of l(x), which is a function that can be specified by the user; by default a multi-layer perceptron is used.

-#     :param input_dim: the number of input dimensions
-#     :type input_dim: int
-#     :param variance: the variance :math:`\sigma^2`
-#     :type variance: float
-#     :param mapping: the mapping that gives the lengthscale across the input space.
-#     :type mapping: GPy.core.Mapping
-#     :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter \sigma^2_w), otherwise there is one weight variance parameter per dimension.
-#     :type ARD: Boolean
-#     :rtype: Kernpart object
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
+    :param variance: the variance :math:`\sigma^2`
+    :type variance: float
+    :param mapping: the mapping that gives the lengthscale across the input space.
+    :type mapping: GPy.core.Mapping
+    :rtype: Kernpart object

-#     """
-#     part = parts.gibbs.Gibbs(input_dim,variance,mapping)
-#     return kern(input_dim, [part])
+    """
+    part = parts.gibbs.Gibbs(input_dim,variance,mapping)
+    return kern(input_dim, [part])

 def poly(input_dim,variance=1., weight_variance=None,bias_variance=1.,degree=2, ARD=False):
     """
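As a quick orientation for reviewers, here is roughly how the re-enabled constructor is used. This is a sketch, assuming the GPy.kern.gibbs entry point from this patch and the library's usual kern.K(X, X2=None) call; the shapes are purely illustrative:

import numpy as np
import GPy

X = np.random.randn(10, 2)
k = GPy.kern.gibbs(2, variance=1.0)  # mapping=None falls back to the default MLP lengthscale map
K = k.K(X)                           # 10 x 10 non-stationary covariance matrix
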
diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index 29b0ea03..d3457c0e 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -547,10 +547,11 @@ class Kern_check_model(Model):
             kernel = GPy.kern.rbf(1)
         if X==None:
             X = np.random.randn(num_samples, kernel.input_dim)
-        if X2==None:
-            X2 = np.random.randn(num_samples2, kernel.input_dim)
         if dL_dK==None:
-            dL_dK = np.ones((X.shape[0], X2.shape[0]))
+            if X2==None:
+                dL_dK = np.ones((X.shape[0], X.shape[0]))
+            else:
+                dL_dK = np.ones((X.shape[0], X2.shape[0]))

         self.kernel=kernel
         self.X = X
@@ -641,3 +642,28 @@ class Kern_check_dKdiag_dX(Kern_check_model):

     def _set_params(self, x):
         self.X=x.reshape(self.X.shape)
+
+def kern_test(kern, X=None, X2=None, verbose=False):
+    """This function runs a series of checks on a kernel to verify the correctness of its implementation. It checks that the covariance function is positive definite and that its gradients are computed correctly, for a randomly generated data set.
+
+    :param kern: the kernel to be tested.
+    :type kern: GPy.kern.Kernpart
+    :param X: X input values to test the covariance function.
+    :type X: ndarray
+    :param X2: X2 input values to test the covariance function.
+    :type X2: ndarray
+    """
+    if X==None:
+        X = np.random.randn(10, kern.input_dim)
+    if X2==None:
+        X2 = np.random.randn(20, kern.input_dim)
+    result = [Kern_check_model(kern, X=X).is_positive_definite(),
+              Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=verbose),
+              Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=verbose),
+              Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=verbose),
+              Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=verbose),
+              Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=verbose)]
+    # Still need to check
+    #     Kern_check_dK_dX(kern, X, X2=None).checkgrad(verbose=verbose)
+    # but currently I don't think that case is implemented.
+    return np.all(result)
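This checker is what the new unit tests at the bottom of the patch call into. A minimal sketch of direct usage (rbf is an existing GPy kernel, used here purely as an example):

import GPy

kern = GPy.kern.rbf(3)
print(GPy.kern.kern_test(kern, verbose=True))  # True when the positive-definiteness and gradient checks all pass
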
diff --git a/GPy/kern/parts/gibbs.py b/GPy/kern/parts/gibbs.py
new file mode 100644
index 00000000..7ddd64f4
--- /dev/null
+++ b/GPy/kern/parts/gibbs.py
@@ -0,0 +1,135 @@
+# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from kernpart import Kernpart
+import numpy as np
+from ...util.linalg import tdot
+from ...core.mapping import Mapping
+import GPy
+
+class Gibbs(Kernpart):
+    """
+    Gibbs and MacKay non-stationary covariance function.
+
+    .. math::
+
+       r = sqrt((x_i - x_j)'*(x_i - x_j))
+
+       k(x_i, x_j) = \sigma^2*Z*exp(-r^2/(l(x)*l(x) + l(x')*l(x')))
+
+       Z = (2*l(x)*l(x')/(l(x)*l(x) + l(x')*l(x')))^{q/2}
+
+    where :math:`l(x)` is a function giving the length scale as a function of space and :math:`q` is the dimensionality of the input space.
+    This is the non-stationary kernel proposed by Mark Gibbs in his 1997
+    thesis. It is similar to an RBF, but has a length scale that varies
+    with input location. This leads to an additional term in front of
+    the kernel.
+
+    The parameters are :math:`\sigma^2`, the process variance, and the parameters of l(x), which is a function that can be specified by the user; by default a multi-layer perceptron is used.
+
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
+    :param variance: the variance :math:`\sigma^2`
+    :type variance: float
+    :param mapping: the mapping that gives the lengthscale across the input space (by default GPy.mappings.MLP is used with 20 hidden nodes).
+    :type mapping: GPy.core.Mapping
+    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (i.e. one weight variance parameter \sigma^2_w), otherwise there is one weight variance parameter per dimension.
+    :type ARD: Boolean
+    :rtype: Kernpart object
+
+    """
+
+    def __init__(self, input_dim, variance=1., mapping=None, ARD=False):
+        self.input_dim = input_dim
+        self.ARD = ARD
+        if not mapping:
+            mapping = GPy.mappings.MLP(output_dim=1, hidden_dim=20, input_dim=input_dim)
+        if not ARD:
+            self.num_params = 1 + mapping.num_params
+        else:
+            raise NotImplementedError
+
+        self.mapping = mapping
+        self.name = 'gibbs'
+        self._set_params(np.hstack((variance, self.mapping._get_params())))
+
+    def _get_params(self):
+        return np.hstack((self.variance, self.mapping._get_params()))
+
+    def _set_params(self, x):
+        assert x.size == (self.num_params)
+        self.variance = x[0]
+        self.mapping._set_params(x[1:])
+
+    def _get_param_names(self):
+        return ['variance'] + self.mapping._get_param_names()
+
+    def K(self, X, X2, target):
+        """Return covariance between X and X2."""
+        self._K_computations(X, X2)
+        target += self.variance*self._K_dvar
+
+    def Kdiag(self, X, target):
+        """Compute the diagonal of the covariance matrix for X."""
+        # On the diagonal r=0 and Z=1, so Kdiag is simply the variance.
+        np.add(target, self.variance, target)
+
+    def dK_dtheta(self, dL_dK, X, X2, target):
+        """Derivative of the covariance with respect to the parameters."""
+        self._K_computations(X, X2)
+        self._dK_computations(dL_dK)
+        if X2==None:
+            gmapping = self.mapping.df_dtheta(2*self._dL_dl[:, None], X)
+        else:
+            gmapping = self.mapping.df_dtheta(self._dL_dl[:, None], X)
+            gmapping += self.mapping.df_dtheta(self._dL_dl_two[:, None], X2)

+        target += np.hstack([(dL_dK*self._K_dvar).sum(), gmapping])
+
+    def dK_dX(self, dL_dK, X, X2, target):
+        """Derivative of the covariance matrix with respect to X."""
+        # First account for gradients arising from the presence of X in the exponent.
+        self._K_computations(X, X2)
+        _K_dist = X[:, None, :] - X2[None, :, :]
+        dK_dX = (-2.*self.variance)*np.transpose((self._K_dvar/self._w2)[:, :, None]*_K_dist, (1, 0, 2))
+        target += np.sum(dK_dX*dL_dK.T[:, :, None], 0)
+        # Now account for gradients arising from the presence of X in the lengthscale.
+        self._dK_computations(dL_dK)
+        target += self.mapping.df_dX(self._dL_dl[:, None], X)
+
+    def dKdiag_dX(self, dL_dKdiag, X, target):
+        """Gradient of diagonal of covariance with respect to X."""
+        # The diagonal is constant (equal to the variance), so its gradient with respect to X is zero.
+        pass
+
+    def dKdiag_dtheta(self, dL_dKdiag, X, target):
+        """Gradient of diagonal of covariance with respect to parameters."""
+        # The diagonal depends only on the variance (the first parameter),
+        # for which the gradient is the sum of dL_dKdiag.
+        target[0] += np.sum(dL_dKdiag)
+
+
+    def _K_computations(self, X, X2=None):
+        """Pre-computations for the covariance function (used both when computing the covariance and its gradients). Here self._K_dvar and self._K_dist2 are updated."""
+        self._lengthscales = self.mapping.f(X)
+        self._lengthscales2 = np.square(self._lengthscales)
+        if X2==None:
+            self._lengthscales_two = self._lengthscales
+            self._lengthscales_two2 = self._lengthscales2
+            Xsquare = np.square(X).sum(1)
+            self._K_dist2 = -2.*tdot(X) + Xsquare[:, None] + Xsquare[None, :]
+        else:
+            self._lengthscales_two = self.mapping.f(X2)
+            self._lengthscales_two2 = np.square(self._lengthscales_two)
+            self._K_dist2 = -2.*np.dot(X, X2.T) + np.square(X).sum(1)[:, None] + np.square(X2).sum(1)[None, :]
+        self._w2 = self._lengthscales2 + self._lengthscales_two2.T
+        prod_length = self._lengthscales*self._lengthscales_two.T
+        self._K_exponential = np.exp(-self._K_dist2/self._w2)
+        self._K_dvar = np.sign(prod_length)*(2*np.abs(prod_length)/self._w2)**(self.input_dim/2.)*self._K_exponential
+
+    def _dK_computations(self, dL_dK):
+        """Pre-computations for the gradients of the covariance function. Here the gradient of the covariance with respect to all the individual lengthscales is computed.
+        :param dL_dK: the gradient of the objective with respect to the covariance function.
+        :type dL_dK: ndarray"""
+
+        self._dL_dl = (dL_dK*self.variance*self._K_dvar*(self.input_dim/2.*(self._lengthscales_two.T**4 - self._lengthscales**4) + 2*self._lengthscales2*self._K_dist2)/(self._w2*self._w2*self._lengthscales)).sum(1)
+        if self._lengthscales_two is self._lengthscales:
+            self._dL_dl_two = None
+        else:
+            self._dL_dl_two = (dL_dK*self.variance*self._K_dvar*(self.input_dim/2.*(self._lengthscales**4 - self._lengthscales_two.T**4) + 2*self._lengthscales_two2.T*self._K_dist2)/(self._w2*self._w2*self._lengthscales_two.T)).sum(0)
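As a cross-check on the algebra in _K_computations, the following standalone numpy sketch evaluates the same covariance directly from the docstring formula. The names gibbs_cov and lengthscale are hypothetical, introduced only for this illustration; lengthscale stands in for mapping.f and can be any smooth positive function:

import numpy as np

def gibbs_cov(X, X2, lengthscale_fn, variance=1.0):
    # k(x, x') = sigma^2 * Z * exp(-r^2/(l(x)^2 + l(x')^2)),
    # with Z = (2*l(x)*l(x')/(l(x)^2 + l(x')^2))^(q/2), as in the docstring above.
    l1 = lengthscale_fn(X)   # column of lengthscales, shape (n, 1)
    l2 = lengthscale_fn(X2)  # shape (m, 1)
    w2 = l1**2 + (l2**2).T   # l(x)^2 + l(x')^2, shape (n, m)
    r2 = np.square(X[:, None, :] - X2[None, :, :]).sum(-1)
    Z = (2.*l1*l2.T/w2)**(X.shape[1]/2.)
    return variance*Z*np.exp(-r2/w2)

X = np.random.randn(10, 2)
lengthscale = lambda Z: 1. + 0.5*np.sqrt(np.square(Z).sum(1))[:, None]
K = gibbs_cov(X, X, lengthscale)
print(np.linalg.eigvalsh(K).min())  # should be (numerically) non-negative
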
diff --git a/GPy/kern/parts/poly.py b/GPy/kern/parts/poly.py
index 542e20e0..b01f3a01 100644
--- a/GPy/kern/parts/poly.py
+++ b/GPy/kern/parts/poly.py
@@ -103,7 +103,7 @@ class POLY(Kernpart):
         """Derivative of the covariance matrix with respect to X"""
         self._K_computations(X, X2)
         arg = self._K_poly_arg
-        target += self.weight_variance*self.degree*self.variance*(((X2[None,:, :])) *(arg**(self.degree-1))[:, :, None]* dL_dK[:, :, None]).sum(1)
+        target += self.weight_variance*self.degree*self.variance*(((X2[None,:, :])) *(arg**(self.degree-1))[:, :, None]*dL_dK[:, :, None]).sum(1)

     def dKdiag_dX(self, dL_dKdiag, X, target):
         """Gradient of diagonal of covariance with respect to X"""
diff --git a/GPy/kern/parts/white.py b/GPy/kern/parts/white.py
index 637dc741..49200bd6 100644
--- a/GPy/kern/parts/white.py
+++ b/GPy/kern/parts/white.py
@@ -44,7 +44,7 @@ class White(Kernpart):
     def dKdiag_dtheta(self,dL_dKdiag,X,target):
         target += np.sum(dL_dKdiag)

-    def dK_dX(self,dL_dK,X,X2,target): 
+    def dK_dX(self,dL_dK,X,X2,target):
         pass

     def dKdiag_dX(self,dL_dKdiag,X,target):
diff --git a/GPy/mappings/kernel.py b/GPy/mappings/kernel.py
index 07ac3cf6..5c802c13 100644
--- a/GPy/mappings/kernel.py
+++ b/GPy/mappings/kernel.py
@@ -33,7 +33,7 @@ class Kernel(Mapping):
         self.A = np.array((self.num_data, self.output_dim))
         self.bias = np.array(self.output_dim)
         self.randomize()
-        
+        self.name = 'kernel'

     def _get_param_names(self):
         return sum([['A_%i_%i' % (n, d) for d in range(self.output_dim)] for n in range(self.num_data)], []) + ['bias_%i' % d for d in range(self.output_dim)]
diff --git a/GPy/mappings/linear.py b/GPy/mappings/linear.py
index 5f6aaa49..5846903d 100644
--- a/GPy/mappings/linear.py
+++ b/GPy/mappings/linear.py
@@ -20,6 +20,7 @@ class Linear(Mapping):
     """

     def __init__(self, input_dim=1, output_dim=1):
+        self.name = 'linear'
         Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
         self.num_params = self.output_dim*(self.input_dim + 1)
         self.W = np.array((self.input_dim, self.output_dim))
diff --git a/GPy/mappings/mlp.py b/GPy/mappings/mlp.py
index 61b2f80e..40ff3782 100644
--- a/GPy/mappings/mlp.py
+++ b/GPy/mappings/mlp.py
@@ -26,6 +26,7 @@ class MLP(Mapping):

     def __init__(self, input_dim=1, output_dim=1, hidden_dim=3):
         Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
+        self.name = 'mlp'
         if isinstance(hidden_dim, int):
             hidden_dim = [hidden_dim]
         self.hidden_dim = hidden_dim
diff --git a/GPy/notes.txt b/GPy/notes.txt
index c8c184a1..38e546d8 100644
--- a/GPy/notes.txt
+++ b/GPy/notes.txt
@@ -59,10 +59,22 @@ Why do likelihoods still have YYT everywhere, didn't we agree to set observed da
 For some reason a stub of _get_param_names(self) wasn't available in the Parameterized base class. Have put it in (is this right?)

-Is there a quick FAQ or something on how to build the documentation? I did it once, but can't remember!
+Is there a quick FAQ or something on how to build the documentation? I did it once, but can't remember! Have started a FAQ.txt file where we can add this type of information.

 Similar for the nosetests ... even ran them last week but can't remember the command!

 Now added Gaussian priors to GPLVM latent variables by default. When running the GPy.examples.dimensionality_reduction.stick() example the print out from print model has the same value for the prior+likelihood as for the prior.

-For the back constrained GP-LVM need priors to be on the Xs not on the model parameters (because they aren't parameters, they are constraints). Need to work out how to do this.
+For the back constrained GP-LVM need priors to be on the Xs not on the model parameters (because they aren't parameters, they are constraints). Need to work out how to do this, perhaps by creating the full GP-LVM model then constraining around it, rather than overriding inside the GP-LVM model.
+
+
+This code fails:
+
+kern = GPy.kern.rbf(2)
+GPy.kern.Kern_check_dK_dX(kern, X=np.random.randn(10, 2), X2=None).checkgrad(verbose=True)
+
+because X2 is now equal to X, so there is a factor of 2 missing. Does this ever come up? Yes, in the GP-LVM (gplvm.py, line 64), where it is called with a corrective factor of 2! And on line 241 of sparse_gp, where it is also called with a corrective factor of 2! In the original MATLAB GPLVM we didn't allow gradients with respect to X alone: the base code multiplied by 2 and then added in the diagonal terms. This is missing in the new code.
+
+
+In white.py, line 41, we need to check whether X and X2 refer to the same object too ... because up the pipeline somewhere someone may have set X2=X when X2 originally arrived equal to None.
+
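A toy numpy sketch of the factor-of-2 point above, independent of GPy (rbf and dL_dX_one_sided are names invented for this illustration): when the same X appears as both arguments of K, a one-sided gradient computation recovers only half the true gradient.

import numpy as np

def rbf(X, X2):
    r2 = np.square(X[:, None, :] - X2[None, :, :]).sum(-1)
    return np.exp(-0.5*r2)

def dL_dX_one_sided(dL_dK, X, X2):
    # gradient of L with respect to X, treating only the *first* argument of K as depending on X
    diff = X[:, None, :] - X2[None, :, :]
    return (dL_dK[:, :, None]*(-diff)*rbf(X, X2)[:, :, None]).sum(1)

np.random.seed(0)
X = np.random.randn(5, 2)

# numerical gradient of L = sum(K(X, X)) with respect to X[0, 0]
eps = 1e-6
Xp = X.copy(); Xp[0, 0] += eps
Xm = X.copy(); Xm[0, 0] -= eps
numeric = (rbf(Xp, Xp).sum() - rbf(Xm, Xm).sum())/(2.*eps)

analytic = dL_dX_one_sided(np.ones((5, 5)), X, X)[0, 0]
print(numeric/analytic)  # approximately 2: the one-sided gradient misses half the terms
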
diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py
index c73644d3..65a8da77 100644
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@@ -26,6 +26,21 @@ class KernelTests(unittest.TestCase):
         self.assertTrue(GPy.kern.Kern_check_dKdiag_dtheta(kern).checkgrad(verbose=verbose))
         self.assertTrue(GPy.kern.Kern_check_dK_dX(kern).checkgrad(verbose=verbose))

+    def test_gibbskernel(self):
+        verbose = False
+        kern = GPy.kern.gibbs(5, mapping=GPy.mappings.Linear(5, 1))
+        self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
+
+    def test_mlpkernel(self):
+        verbose = False
+        kern = GPy.kern.mlp(5)
+        self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
+
+    def test_polykernel(self):
+        verbose = False
+        kern = GPy.kern.poly(5, degree=4)
+        self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
+
     def test_fixedkernel(self):
         """
         Fixed effect kernel test
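For reference, the new cases can be driven directly with the standard unittest runner (a sketch, assuming GPy is importable from the working directory):

import unittest
from GPy.testing.kernel_tests import KernelTests

suite = unittest.TestLoader().loadTestsFromTestCase(KernelTests)
unittest.TextTestRunner(verbosity=2).run(suite)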