Merge branch 'devel' into params

Conflicts: GPy/core/transformations.py GPy/kern/parts/kernpart.py
2026-05-01 07:46:22 +02:00 · 2013-09-20 17:20:46 +01:00 · 2013-09-20 17:20:46 +01:00 · c2d217e72c
commit c2d217e72c
parent e5816e39dd dfb00f46b4
77 changed files with 3608 additions and 807 deletions
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@ -5,7 +5,6 @@ import numpy as np
 from kern import kern
 import parts

-
 def rbf_inv(input_dim,variance=1., inv_lengthscale=None,ARD=False):
    """
    Construct an RBF kernel
@ -74,9 +73,9 @@ def gibbs(input_dim,variance=1., mapping=None):
    Gibbs and MacKay non-stationary covariance function.

    .. math::
-       
+
       r = sqrt((x_i - x_j)'*(x_i - x_j))
-       
+
       k(x_i, x_j) = \sigma^2*Z*exp(-r^2/(l(x)*l(x) + l(x')*l(x')))

       Z = \sqrt{2*l(x)*l(x')/(l(x)*l(x) + l(x')*l(x')}
@ -90,7 +89,7 @@ def gibbs(input_dim,variance=1., mapping=None):
        The parameters are :math:`\sigma^2`, the process variance, and the parameters of l(x) which is a function that can be specified by the user, by default an multi-layer peceptron is used is used.

        :param input_dim: the number of input dimensions
-        :type input_dim: int 
+        :type input_dim: int
        :param variance: the variance :math:`\sigma^2`
        :type variance: float
        :param mapping: the mapping that gives the lengthscale across the input space.
@ -103,6 +102,12 @@ def gibbs(input_dim,variance=1., mapping=None):
    part = parts.gibbs.Gibbs(input_dim,variance,mapping)
    return kern(input_dim, [part])

+def hetero(input_dim, mapping=None, transform=None):
+    """
+    Construct a kernel from a single ``parts.hetero.Hetero`` part.
+
+    The three arguments are forwarded unchanged to ``Hetero`` and the
+    resulting part is wrapped in a ``kern`` object of the same input_dim.
+
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
+    :param mapping: forwarded to ``Hetero`` (None is passed through as is)
+    :param transform: forwarded to ``Hetero`` (None is passed through as is)
+    :rtype: kernel object
+
+    NOTE(review): this relies on ``parts.hetero`` being importable from the
+    ``parts`` package -- confirm that the package actually imports it.
+    """
+    part = parts.hetero.Hetero(input_dim,mapping,transform)
+    return kern(input_dim, [part])
+
 def poly(input_dim,variance=1., weight_variance=None,bias_variance=1.,degree=2, ARD=False):
    """
    Construct a polynomial kernel
@ -135,6 +140,7 @@ def white(input_dim,variance=1.):
    part = parts.white.White(input_dim,variance)
    return kern(input_dim, [part])

+
 def exponential(input_dim,variance=1., lengthscale=None, ARD=False):
    """
    Construct an exponential kernel
@ -340,29 +346,30 @@ def symmetric(k):
    k_.parts = [symmetric.Symmetric(p) for p in k.parts]
    return k_

-def coregionalise(output_dim, rank=1, W=None, kappa=None):
+def coregionalize(num_outputs,W_columns=1, W=None, kappa=None):
    """
-        Coregionalisation kernel. 
-
-    Used for computing covariance functions of the form
-    .. math::
-       k_2(x, y)=\mathbf{B} k(x, y)
-    where
+    Coregionalization matrix B, of the form:
    .. math::
       \mathbf{B} = \mathbf{W}\mathbf{W}^\top + kappa \mathbf{I}

-    :param output_dim: the number of output dimensions
-    :type output_dim: int
-    :param rank: the rank of the coregionalisation matrix.
-    :type rank: int
-    :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalisation matrix B.
-    :type W: ndarray
-    :param kappa: a diagonal term which allows the outputs to behave independently.
+    An intrinsic/linear coregionalization kernel of the form
+    .. math::
+       k_2(x, y)=\mathbf{B} k(x, y)
+
+    it is obtained as the tensor product between a kernel k(x,y) and B.
+
+    :param num_outputs: the number of outputs to coregionalize
+    :type num_outputs: int
+    :param W_columns: number of columns of the W matrix (this parameter is ignored if parameter W is not None)
+    :type W_columns: int
+    :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalization matrix B
+    :type W: numpy array of dimensionality (num_outputs, W_columns)
+    :param kappa: a vector which allows the outputs to behave independently
+    :type kappa: numpy array of dimensionality  (num_outputs,)
    :rtype: kernel object

-    .. Note: see coregionalisation examples in GPy.examples.regression for some usage.
    """
-    p = parts.coregionalise.Coregionalise(output_dim,rank,W,kappa)
+    p = parts.coregionalize.Coregionalize(num_outputs,W_columns,W,kappa)
    return kern(1,[p])


@ -421,3 +428,31 @@ def hierarchical(k):
    #     assert (sl.start is None) and (sl.stop is None), "cannot adjust input slices! (TODO)"
    _parts = [parts.hierarchical.Hierarchical(k.parts)]
    return kern(k.input_dim+len(k.parts),_parts)
+
+def build_lcm(input_dim, num_outputs, kernel_list = [], W_columns=1,W=None,kappa=None):
+    """
+    Builds a kernel of a linear coregionalization model
+
+    Every kernel in kernel_list is combined (through the ``**`` operator)
+    with its own coregionalization kernel, and the resulting terms are summed.
+
+    :param input_dim: Input dimensionality
+    :type input_dim: int
+    :param num_outputs: Number of outputs
+    :type num_outputs: int
+    :param kernel_list: List of coregionalized kernels, each element in the list will be multiplied by a different coregionalization matrix
+    :type kernel_list: list of GPy kernels
+    :param W_columns: number of columns of the coregionalization parameters 'coregion_W'
+    :type W_columns: integer
+    :param W: forwarded to every call of coregionalize
+    :param kappa: forwarded to every call of coregionalize
+    :rtype: kernel object
+    :raises IndexError: if kernel_list is empty (its first element is always read)
+
+    .. Note:: the kernels' input dimensionality is overwritten in place to fit input_dim
+    """
+    # Imported here so the function does not depend on a module-level import.
+    import warnings
+
+    for k in kernel_list:
+        # '!=' replaces the '<>' spelling, which Python 3 no longer accepts.
+        if k.input_dim != input_dim:
+            k.input_dim = input_dim
+            warnings.warn("kernel's input dimension overwritten to fit input_dim parameter.")
+
+    # A fresh coregionalization kernel is built for every term, so each
+    # kernel in the list gets its own coregionalization matrix.
+    k_coreg = coregionalize(num_outputs,W_columns,W,kappa)
+    kernel = kernel_list[0]**k_coreg.copy()
+
+    for k in kernel_list[1:]:
+        k_coreg = coregionalize(num_outputs,W_columns,W,kappa)
+        kernel += k**k_coreg.copy()
+
+    return kernel
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@ -13,7 +13,9 @@ import GPy
 class kern(Parameterized):
    def __init__(self, input_dim, parts=[], input_slices=None):
        """
-        This is the main kernel class for GPy. It handles multiple (additive) kernel functions, and keeps track of variaous things like which parameters live where.
+        This is the main kernel class for GPy. It handles multiple
+        (additive) kernel functions, and keeps track of various things
+        like which parameters live where.

        The technical code for kernels is divided into _parts_ (see
        e.g. rbf.py). This object contains a list of parts, which are
@ -34,6 +36,11 @@ class kern(Parameterized):

        self.input_dim = input_dim

+        part_names = [k.name for k in self.parts]
+        self.name=''
+        for name in part_names:
+            self.name += name + '+'
+        self.name = self.name[:-1]
        # deal with input_slices
        if input_slices is None:
            self.input_slices = [slice(None) for p in self.parts]
@ -334,10 +341,8 @@ class kern(Parameterized):
        :type X: np.ndarray (num_samples x input_dim)
        :param X2: Observed data inputs (optional, defaults to X)
        :type X2: np.ndarray (num_inducing x input_dim)"""
-        if X2 is None:
-            X2 = X
        target = np.zeros_like(X)
-        if X2 is None:
+        if X2 is None: 
            [p.dK_dX(dL_dK, X[:, i_s], None, target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
        else:
            [p.dK_dX(dL_dK, X[:, i_s], X2[:, i_s], target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
@ -654,17 +659,85 @@ def kern_test(kern, X=None, X2=None, verbose=False):
    :param X2: X2 input values to test the covariance function.
    :type X2: ndarray
    """
+    pass_checks = True
    if X==None:
        X = np.random.randn(10, kern.input_dim)
    if X2==None:
        X2 = np.random.randn(20, kern.input_dim)
-    result = [Kern_check_model(kern, X=X).is_positive_definite(),
-              Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=verbose),
-              Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=verbose),
-             Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=verbose),
-              Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=verbose),
-              Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=verbose)]
-    # Need to check 
-    #Kern_check_dK_dX(kern, X, X2=None).checkgrad(verbose=verbose)]
-    # but currently I think these aren't implemented.
-    return np.all(result)
+    if verbose:
+        print("Checking covariance function is positive definite.")
+    result = Kern_check_model(kern, X=X).is_positive_definite()
+    if result and verbose:
+        print("Check passed.")
+    if not result:
+        print("Positive definite check failed for " + kern.name + " covariance function.")
+        pass_checks = False
+        return False
+
+    if verbose:
+        print("Checking gradients of K(X, X) wrt theta.")
+    result = Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=verbose)
+    if result and verbose:
+        print("Check passed.")
+    if not result:
+        print("Gradient of K(X, X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:")
+        Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=True)
+        pass_checks = False
+        return False
+    
+    if verbose:
+        print("Checking gradients of K(X, X2) wrt theta.")
+    result = Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=verbose)
+    if result and verbose:
+        print("Check passed.")
+    if not result:
+        print("Gradient of K(X, X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:")
+        Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=True)
+        pass_checks = False
+        return False
+    
+    if verbose:
+        print("Checking gradients of Kdiag(X) wrt theta.")
+    result = Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=verbose)
+    if result and verbose:
+        print("Check passed.")
+    if not result:
+        print("Gradient of Kdiag(X) wrt theta failed for " + kern.name + " covariance function. Gradient values as follows:")
+        Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=True)
+        pass_checks = False
+        return False
+        
+    if verbose:
+        print("Checking gradients of K(X, X) wrt X.")
+    result = Kern_check_dK_dX(kern, X=X, X2=None).checkgrad(verbose=verbose)
+    if result and verbose:
+        print("Check passed.")
+    if not result:
+        print("Gradient of K(X, X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:")
+        Kern_check_dK_dX(kern, X=X, X2=None).checkgrad(verbose=True)
+        pass_checks = False
+        return False
+
+    if verbose:
+        print("Checking gradients of K(X, X2) wrt X.")
+    result = Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=verbose)
+    if result and verbose:
+        print("Check passed.")
+    if not result:
+        print("Gradient of K(X, X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:")
+        Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=True)
+        pass_checks = False
+        return False
+
+    if verbose:
+        print("Checking gradients of Kdiag(X) wrt X.")
+    result = Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=verbose)
+    if result and verbose:
+        print("Check passed.")
+    if not result:
+        print("Gradient of Kdiag(X) wrt X failed for " + kern.name + " covariance function. Gradient values as follows:")
+        Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=True)
+        pass_checks = False
+        return False
+    
+    return pass_checks
--- a/GPy/kern/parts/Matern32.py
+++ b/GPy/kern/parts/Matern32.py
@ -98,9 +98,13 @@ class Matern32(Kernpart):

    def dK_dX(self, dL_dK, X, X2, target):
        """derivative of the covariance matrix with respect to X."""
-        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None]
-        ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf)
+        if X2 is None:
+            # K(X, X) case: X is used for both kernel arguments, and the
+            # distance derivative is doubled relative to the X2 branch
+            # (presumably because each point appears in a row and a column
+            # of K -- confirm that dL_dK is symmetric here).
+            dist = np.sqrt(np.sum(np.square((X[:, None, :] - X[None, :, :]) / self.lengthscale), -1))[:, :, None]
+            # Zero distances are swapped for inf so the division yields 0 instead of 0/0.
+            ddist_dX = 2*(X[:, None, :] - X[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf)
+
+        else:
+            dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None]
+            ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf)
        dK_dX = -np.transpose(3 * self.variance * dist * np.exp(-np.sqrt(3) * dist) * ddist_dX, (1, 0, 2))
        target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)

--- a/GPy/kern/parts/Matern52.py
+++ b/GPy/kern/parts/Matern52.py
@ -98,9 +98,12 @@ class Matern52(Kernpart):

    def dK_dX(self,dL_dK,X,X2,target):
        """derivative of the covariance matrix with respect to X."""
-        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))[:,:,None]
-        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf)
+        if X2 is None:
+            # K(X, X) case: X is used for both kernel arguments, and the
+            # distance derivative is doubled relative to the X2 branch
+            # (presumably because each point appears in a row and a column
+            # of K -- confirm that dL_dK is symmetric here).
+            dist = np.sqrt(np.sum(np.square((X[:,None,:]-X[None,:,:])/self.lengthscale),-1))[:,:,None]
+            # Zero distances are swapped for inf so the division yields 0 instead of 0/0.
+            ddist_dX = 2*(X[:,None,:]-X[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf)
+        else:
+            dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscale),-1))[:,:,None]
+            ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscale**2/np.where(dist!=0.,dist,np.inf)
        dK_dX = -  np.transpose(self.variance*5./3*dist*(1+np.sqrt(5)*dist)*np.exp(-np.sqrt(5)*dist)*ddist_dX,(1,0,2))
        target += np.sum(dK_dX*dL_dK.T[:,:,None],0)

--- a/GPy/kern/parts/init.py
+++ b/GPy/kern/parts/init.py
@ -1,10 +1,12 @@
 import bias
 import Brownian
-import coregionalise
+import coregionalize
 import exponential
 import finite_dimensional
 import fixed
 import gibbs
+#import hetero #hetero.py is not committed: omitting for now. JH. 
+import hierarchical
 import independent_outputs
 import linear
 import Matern32
@ -19,8 +21,7 @@ import prod
 import rational_quadratic
 import rbfcos
 import rbf
+import rbf_inv
 import spline
 import symmetric
 import white
-import hierarchical
-import rbf_inv
--- a/GPy/kern/parts/coregionalize.py
+++ b/GPy/kern/parts/coregionalize.py
@ -7,44 +7,48 @@ from GPy.util.linalg import mdot, pdinv
 import pdb
 from scipy import weave

-class Coregionalise(Kernpart):
+class Coregionalize(Kernpart):
    """
-    Coregionalisation kernel. 
+    Covariance function for intrinsic/linear coregionalization models

-    Used for computing covariance functions of the form
+    This covariance has the form
    .. math::
-       k_2(x, y)=B k(x, y)
-    where
+       \mathbf{B} = \mathbf{W}\mathbf{W}^\top + kappa \mathbf{I}
+
+    An intrinsic/linear coregionalization covariance function of the form
    .. math::
-       B = WW^\top + diag(kappa)
+       k_2(x, y)=\mathbf{B} k(x, y)

-    :param output_dim: the number of output dimensions
-    :type output_dim: int
-    :param rank: the rank of the coregionalisation matrix.
-    :type rank: int
-    :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalisation matrix B.
-    :type W: ndarray
-    :param kappa: a diagonal term which allows the outputs to behave independently.
-    :rtype: kernel object
+    it is obtained as the tensor product between a covariance function
+    k(x,y) and B.

-    .. Note: see coregionalisation examples in GPy.examples.regression for some usage.
+    :param num_outputs: number of outputs to coregionalize
+    :type num_outputs: int
+    :param W_columns: number of columns of the W matrix (this parameter is ignored if parameter W is not None)
+    :type W_columns: int
+    :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalization matrix B
+    :type W: numpy array of dimensionality (num_outputs, W_columns)
+    :param kappa: a vector which allows the outputs to behave independently
+    :type kappa: numpy array of dimensionality  (num_outputs,)
+
+    .. Note: see coregionalization examples in GPy.examples.regression for some usage.
    """
-    def __init__(self,output_dim,rank=1, W=None, kappa=None):
+    def __init__(self,num_outputs,W_columns=1, W=None, kappa=None):
        self.input_dim = 1
        self.name = 'coregion'
-        self.output_dim = output_dim
-        self.rank = rank
+        self.num_outputs = num_outputs
+        self.W_columns = W_columns
        if W is None:
-            self.W = 0.5*np.random.randn(self.output_dim,self.rank)/np.sqrt(self.rank)
+            self.W = 0.5*np.random.randn(self.num_outputs,self.W_columns)/np.sqrt(self.W_columns)
        else:
-            assert W.shape==(self.output_dim,self.rank)
+            assert W.shape==(self.num_outputs,self.W_columns)
            self.W = W
        if kappa is None:
-            kappa = 0.5*np.ones(self.output_dim)
+            kappa = 0.5*np.ones(self.num_outputs)
        else:
-            assert kappa.shape==(self.output_dim,)
+            assert kappa.shape==(self.num_outputs,)
        self.kappa = kappa
-        self.num_params = self.output_dim*(self.rank + 1)
+        self.num_params = self.num_outputs*(self.W_columns + 1)
        self._set_params(np.hstack([self.W.flatten(),self.kappa]))

    def _get_params(self):
@ -52,12 +56,12 @@ class Coregionalise(Kernpart):

    def _set_params(self,x):
        assert x.size == self.num_params
-        self.kappa = x[-self.output_dim:]
-        self.W = x[:-self.output_dim].reshape(self.output_dim,self.rank)
+        self.kappa = x[-self.num_outputs:]
+        self.W = x[:-self.num_outputs].reshape(self.num_outputs,self.W_columns)
        self.B = np.dot(self.W,self.W.T) + np.diag(self.kappa)

    def _get_param_names(self):
-        return sum([['W%i_%i'%(i,j) for j in range(self.rank)] for i in range(self.output_dim)],[]) + ['kappa_%i'%i for i in range(self.output_dim)]
+        return sum([['W%i_%i'%(i,j) for j in range(self.W_columns)] for i in range(self.num_outputs)],[]) + ['kappa_%i'%i for i in range(self.num_outputs)]

    def K(self,index,index2,target):
        index = np.asarray(index,dtype=np.int)
@ -75,26 +79,26 @@ class Coregionalise(Kernpart):
        if index2 is None:
            code="""
            for(int i=0;i<N; i++){
-              target[i+i*N] += B[index[i]+output_dim*index[i]];
+              target[i+i*N] += B[index[i]+num_outputs*index[i]];
              for(int j=0; j<i; j++){
-                  target[j+i*N] += B[index[i]+output_dim*index[j]];
+                  target[j+i*N] += B[index[i]+num_outputs*index[j]];
                  target[i+j*N] += target[j+i*N];
                }
              }
            """
-            N,B,output_dim = index.size, self.B, self.output_dim
-            weave.inline(code,['target','index','N','B','output_dim'])
+            N,B,num_outputs = index.size, self.B, self.num_outputs
+            weave.inline(code,['target','index','N','B','num_outputs'])
        else:
            index2 = np.asarray(index2,dtype=np.int)
            code="""
            for(int i=0;i<num_inducing; i++){
              for(int j=0; j<N; j++){
-                  target[i+j*num_inducing] += B[output_dim*index[j]+index2[i]];
+                  target[i+j*num_inducing] += B[num_outputs*index[j]+index2[i]];
                }
              }
            """
-            N,num_inducing,B,output_dim = index.size,index2.size, self.B, self.output_dim
-            weave.inline(code,['target','index','index2','N','num_inducing','B','output_dim'])
+            N,num_inducing,B,num_outputs = index.size,index2.size, self.B, self.num_outputs
+            weave.inline(code,['target','index','index2','N','num_inducing','B','num_outputs'])


    def Kdiag(self,index,target):
@ -111,12 +115,12 @@ class Coregionalise(Kernpart):
        code="""
        for(int i=0; i<num_inducing; i++){
          for(int j=0; j<N; j++){
-            dL_dK_small[index[j] + output_dim*index2[i]] += dL_dK[i+j*num_inducing];
+            dL_dK_small[index[j] + num_outputs*index2[i]] += dL_dK[i+j*num_inducing];
          }
        }
        """
-        N, num_inducing, output_dim = index.size, index2.size, self.output_dim
-        weave.inline(code, ['N','num_inducing','output_dim','dL_dK','dL_dK_small','index','index2'])
+        N, num_inducing, num_outputs = index.size, index2.size, self.num_outputs
+        weave.inline(code, ['N','num_inducing','num_outputs','dL_dK','dL_dK_small','index','index2'])

        dkappa = np.diag(dL_dK_small)
        dL_dK_small += dL_dK_small.T
@ -133,8 +137,8 @@ class Coregionalise(Kernpart):
        ii,jj = ii.T, jj.T

        dL_dK_small = np.zeros_like(self.B)
-        for i in range(self.output_dim):
-            for j in range(self.output_dim):
+        for i in range(self.num_outputs):
+            for j in range(self.num_outputs):
                tmp = np.sum(dL_dK[(ii==i)*(jj==j)])
                dL_dK_small[i,j] = tmp

@ -146,15 +150,13 @@ class Coregionalise(Kernpart):

    def dKdiag_dtheta(self,dL_dKdiag,index,target):
        index = np.asarray(index,dtype=np.int).flatten()
-        dL_dKdiag_small = np.zeros(self.output_dim)
-        for i in range(self.output_dim):
+        dL_dKdiag_small = np.zeros(self.num_outputs)
+        for i in range(self.num_outputs):
            dL_dKdiag_small[i] += np.sum(dL_dKdiag[index==i])
        dW = 2.*self.W*dL_dKdiag_small[:,None]
        dkappa = dL_dKdiag_small
        target += np.hstack([dW.flatten(),dkappa])

    def dK_dX(self,dL_dK,X,X2,target):
+        #NOTE In this case, pass is equivalent to returning zero.
        pass
-
-
-
--- a/GPy/kern/parts/gibbs.py
+++ b/GPy/kern/parts/gibbs.py
@ -9,7 +9,7 @@ import GPy

 class Gibbs(Kernpart):
    """
-    Gibbs and MacKay non-stationary covariance function.
+    Gibbs non-stationary covariance function. 

    .. math::
       
@ -25,7 +25,10 @@ class Gibbs(Kernpart):
        with input location. This leads to an additional term in front of
        the kernel.

-        The parameters are :math:`\sigma^2`, the process variance, and the parameters of l(x) which is a function that can be specified by the user, by default an multi-layer peceptron is used is used.
+        The parameters are :math:`\sigma^2`, the process variance, and
+        the parameters of l(x) which is a function that can be
+        specified by the user, by default a multi-layer perceptron is
+        used.

        :param input_dim: the number of input dimensions
        :type input_dim: int 
@ -37,6 +40,15 @@ class Gibbs(Kernpart):
        :type ARD: Boolean
        :rtype: Kernpart object

+    See Mark Gibbs's thesis for more details: Gibbs,
+    M. N. (1997). Bayesian Gaussian Processes for Regression and
+    Classification. PhD thesis, Department of Physics, University of
+    Cambridge. Or also see Page 93 of Gaussian Processes for Machine
+    Learning by Rasmussen and Williams. Although note that we do not
+    constrain the lengthscale to be positive by default. This allows
+    anticorrelation to occur. The positive constraint can be included
+    by the user manually.
+
    """

    def __init__(self, input_dim, variance=1., mapping=None, ARD=False):
@ -89,12 +101,18 @@ class Gibbs(Kernpart):
        """Derivative of the covariance matrix with respect to X."""
        # First account for gradients arising from presence of X in exponent.
        self._K_computations(X, X2)
-        _K_dist = X[:, None, :] - X2[None, :, :]
+        if X2 is None:
+            _K_dist = 2*(X[:, None, :] - X[None, :, :])
+        else:
+            _K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_co
        dK_dX = (-2.*self.variance)*np.transpose((self._K_dvar/self._w2)[:, :, None]*_K_dist, (1, 0, 2))
        target += np.sum(dK_dX*dL_dK.T[:, :, None], 0)
        # Now account for gradients arising from presence of X in lengthscale.
        self._dK_computations(dL_dK)
-        target += self.mapping.df_dX(self._dL_dl[:, None], X)
+        if X2 is None:
+            target += 2.*self.mapping.df_dX(self._dL_dl[:, None], X)
+        else:
+            target += self.mapping.df_dX(self._dL_dl[:, None], X)
    
    def dKdiag_dX(self, dL_dKdiag, X, target):
        """Gradient of diagonal of covariance with respect to X."""
@ -102,7 +120,8 @@ class Gibbs(Kernpart):

    def dKdiag_dtheta(self, dL_dKdiag, X, target):
        """Gradient of diagonal of covariance with respect to parameters."""
-        pass
+        target[0] += np.sum(dL_dKdiag)
+

    
    def _K_computations(self, X, X2=None):
--- a/GPy/kern/parts/hetero.py
+++ b/GPy/kern/parts/hetero.py
@ -0,0 +1,101 @@
+# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from IPython.core.debugger import Tracer; debug_here=Tracer()
+from kernpart import Kernpart
+import numpy as np
+from ...util.linalg import tdot
+from ...core.mapping import Mapping
+import GPy
+
+class Hetero(Kernpart):
+    r"""
+    Heteroscedastic noise which depends on input location. See, for example, this paper by Goldberg et al.
+
+    TODO: Need to constrain the function outputs positive (still thinking of best way of doing this!!! Yes, intend to use transformations, but what's the *best* way). Currently just squaring output.
+
+    .. math::
+
+       k(x_i, x_j) = \delta_{i,j} \sigma^2(x_i)
+
+    where :math:`\sigma^2(x)` is a function giving the variance as a function of input space and :math:`\delta_{i,j}` is the Kronecker delta function.
+
+    The parameters are the parameters of :math:`\sigma^2(x)` which is a
+    function that can be specified by the user, by default a
+    multi-layer perceptron is used.
+
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
+    :param mapping: the mapping whose squared output gives the noise variance across the input space (by default GPy.mappings.MLP is used with 20 hidden nodes).
+    :type mapping: GPy.core.Mapping
+    :param transform: stored as ``self.transform`` but not used by any method of this class yet (by default GPy.core.transformations.logexp()).
+    :rtype: Kernpart object
+
+    See this paper:
+
+    Goldberg, P. W.  Williams, C. K. I. and Bishop,
+    C. M. (1998) Regression with Input-dependent Noise: a Gaussian
+    Process Treatment In Advances in Neural Information Processing
+    Systems, Volume 10, pp.  493-499. MIT Press
+
+    for a Gaussian process treatment of this problem.
+
+    """
+
+    def __init__(self, input_dim, mapping=None, transform=None):
+        self.input_dim = input_dim
+        # Explicit None checks: a user-supplied object must never be
+        # replaced just because it happens to evaluate as false.
+        if mapping is None:
+            mapping = GPy.mappings.MLP(output_dim=1, hidden_dim=20, input_dim=input_dim)
+        if transform is None:
+            transform = GPy.core.transformations.logexp()
+
+        self.transform = transform
+        self.mapping = mapping
+        self.name='hetero'
+        # The kernel has no parameters of its own: they are the mapping's.
+        self.num_params=self.mapping.num_params
+        self._set_params(self.mapping._get_params())
+
+    def _get_params(self):
+        """Return the parameter vector of the underlying mapping."""
+        return self.mapping._get_params()
+
+    def _set_params(self, x):
+        """Forward the parameter vector x to the underlying mapping."""
+        assert x.size == (self.num_params)
+        self.mapping._set_params(x)
+
+    def _get_param_names(self):
+        """Return the parameter names of the underlying mapping."""
+        return self.mapping._get_param_names()
+
+    def K(self, X, X2, target):
+        """Add the covariance between X and X2 to target (in place).
+
+        Only the diagonal is touched, and only when X2 is None or is the
+        very same object as X; for any other X2 nothing is added.
+        """
+        # 'is None' rather than '== None': the latter is an elementwise
+        # comparison when X2 is a numpy array.
+        if X2 is None or X2 is X:
+            target[np.diag_indices_from(target)] += self._Kdiag(X)
+
+    def Kdiag(self, X, target):
+        """Compute the diagonal of the covariance matrix for X."""
+        target+=self._Kdiag(X)
+
+    def _Kdiag(self, X):
+        """Helper function for computing the diagonal elements of the covariance."""
+        # Squaring the mapping output keeps the variance non-negative.
+        return self.mapping.f(X).flatten()**2
+
+    def dK_dtheta(self, dL_dK, X, X2, target):
+        """Derivative of the covariance with respect to the parameters."""
+        if X2 is None or X2 is X:
+            # Stride shape[0]+1 through the flattened matrix picks out the
+            # diagonal entries of a square dL_dK.
+            dL_dKdiag = dL_dK.flat[::dL_dK.shape[0]+1]
+            self.dKdiag_dtheta(dL_dKdiag, X, target)
+
+    def dKdiag_dtheta(self, dL_dKdiag, X, target):
+        """Gradient of diagonal of covariance with respect to parameters."""
+        # NOTE(review): the factor 2*f(X) comes from differentiating f(X)**2;
+        # applying it after df_dtheta relies on the shapes broadcasting as
+        # intended -- confirm against the Mapping implementation.
+        target += 2.*self.mapping.df_dtheta(dL_dKdiag[:, None], X)*self.mapping.f(X)
+
+    def dK_dX(self, dL_dK, X, X2, target):
+        """Derivative of the covariance matrix with respect to X."""
+        if X2 is None or X2 is X:
+            dL_dKdiag = dL_dK.flat[::dL_dK.shape[0]+1]
+            self.dKdiag_dX(dL_dKdiag, X, target)
+
+    def dKdiag_dX(self, dL_dKdiag, X, target):
+        """Gradient of diagonal of covariance with respect to X."""
+        target += 2.*self.mapping.df_dX(dL_dKdiag[:, None], X)*self.mapping.f(X)
+
+
+    
--- a/GPy/kern/parts/kernpart.py
+++ b/GPy/kern/parts/kernpart.py
@ -1,6 +1,5 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
-import numpy
+import numpy as np


 class Kernpart(object):
@ -60,6 +59,45 @@ class Kernpart(object):
    def dK_dX(self, dL_dK, X, X2, target):
        raise NotImplementedError

+
+
+class Kernpart_stationary(Kernpart):
+    """
+    Kernel part holding a lengthscale: a single one, or one per input
+    dimension when ARD is set.
+    """
+    def __init__(self, input_dim, lengthscale=None, ARD=False):
+        self.input_dim = input_dim
+        self.ARD = ARD
+        # One lengthscale per input dimension with ARD, a single one otherwise;
+        # num_params counts one parameter more than the lengthscales.
+        expected = self.input_dim if ARD else 1
+        self.num_params = expected + 1
+        if lengthscale is None:
+            self.lengthscale = np.ones(expected)
+        else:
+            self.lengthscale = np.asarray(lengthscale)
+            if ARD:
+                assert self.lengthscale.size == self.input_dim, "bad number of lengthscales"
+            else:
+                assert self.lengthscale.size == 1, "Only one lengthscale needed for non-ARD kernel"
+
+        # initialize cache with placeholder (uninitialised) arrays
+        self._X, self._X2, self._params = np.empty(shape=(3, 1))
+        self._Z, self._mu, self._S = np.empty(shape=(3, 1))
+
+    def _set_params(self, x):
+        """Store x as the lengthscale, precompute its square and drop the caches."""
+        self.lengthscale = x
+        self.lengthscale2 = np.square(x)
+        # reset cached results (Z, mu, S and X, X2, params)
+        self._Z, self._mu, self._S = np.empty(shape=(3, 1))
+        self._X, self._X2, self._params = np.empty(shape=(3, 1))
+
+
+    def dKdiag_dtheta(self, dL_dKdiag, X, target):
+        """Accumulate the gradient of the diagonal wrt the parameters into target."""
+        # Only the first slot of target is updated: for stationary
+        # covariances the derivative of the diagonal wrt lengthscale is 0.
+        target[0] = target[0] + np.sum(dL_dKdiag)
+
+
 class Kernpart_inner(Kernpart):
    def __init__(self,input_dim):
        """
@ -73,5 +111,5 @@ class Kernpart_inner(Kernpart):
        Kernpart.__init__(self, input_dim)

        # initialize cache
-        self._Z, self._mu, self._S = numpy.empty(shape=(3, 1))
-        self._X, self._X2, self._params = numpy.empty(shape=(3, 1))
+        self._Z, self._mu, self._S = np.empty(shape=(3, 1))
+        self._X, self._X2, self._params = np.empty(shape=(3, 1))
--- a/GPy/kern/parts/linear.py
+++ b/GPy/kern/parts/linear.py
@ -99,7 +99,10 @@ class Linear(Kernpart):
            target += tmp.sum()

    def dK_dX(self, dL_dK, X, X2, target):
-        target += (((X2[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1)
+        """Add the derivative of the covariance matrix with respect to X to target."""
+        if X2 is None:
+            # K(X, X) case: X stands in for X2 and the result is doubled
+            # (presumably because X enters both kernel arguments -- confirm
+            # that dL_dK is symmetric here).
+            target += 2*(((X[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1)
+        else:
+            target += (((X2[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1)

    def dKdiag_dX(self,dL_dKdiag,X,target):
        target += 2.*self.variances*dL_dKdiag[:,None]*X
--- a/GPy/kern/parts/mlp.py
+++ b/GPy/kern/parts/mlp.py
@ -110,9 +110,13 @@ class MLP(Kernpart):
        arg = self._K_asin_arg
        numer = self._K_numer
        denom = self._K_denom
-        vec2 = (X2*X2).sum(1)*self.weight_variance + self.bias_variance + 1.
        denom3 = denom*denom*denom
-        target += four_over_tau*self.weight_variance*self.variance*((X2[None, :, :]/denom[:, :, None] - vec2[None, :, None]*X[:, None, :]*(numer/denom3)[:, :, None])*(dL_dK/np.sqrt(1-arg*arg))[:, :, None]).sum(1)
+        if X2 is not None:
+            vec2 = (X2*X2).sum(1)*self.weight_variance+self.bias_variance + 1.
+            target += four_over_tau*self.weight_variance*self.variance*((X2[None, :, :]/denom[:, :, None] - vec2[None, :, None]*X[:, None, :]*(numer/denom3)[:, :, None])*(dL_dK/np.sqrt(1-arg*arg))[:, :, None]).sum(1)
+        else:
+            vec = (X*X).sum(1)*self.weight_variance+self.bias_variance + 1.
+            target += 2*four_over_tau*self.weight_variance*self.variance*((X[None, :, :]/denom[:, :, None] - vec[None, :, None]*X[:, None, :]*(numer/denom3)[:, :, None])*(dL_dK/np.sqrt(1-arg*arg))[:, :, None]).sum(1)
            
    def dKdiag_dX(self, dL_dKdiag, X, target):
        """Gradient of diagonal of covariance with respect to X"""
--- a/GPy/kern/parts/poly.py
+++ b/GPy/kern/parts/poly.py
@ -103,7 +103,10 @@ class POLY(Kernpart):
        """Derivative of the covariance matrix with respect to X"""
        self._K_computations(X, X2)
        arg = self._K_poly_arg
-        target += self.weight_variance*self.degree*self.variance*(((X2[None,:, :])) *(arg**(self.degree-1))[:, :, None]*dL_dK[:, :, None]).sum(1)
+        if X2 is None:
+            target += 2*self.weight_variance*self.degree*self.variance*(((X[None,:, :])) *(arg**(self.degree-1))[:, :, None]*dL_dK[:, :, None]).sum(1)
+        else:
+            target += self.weight_variance*self.degree*self.variance*(((X2[None,:, :])) *(arg**(self.degree-1))[:, :, None]*dL_dK[:, :, None]).sum(1)
            
    def dKdiag_dX(self, dL_dKdiag, X, target):
        """Gradient of diagonal of covariance with respect to X"""
--- a/GPy/kern/parts/prod.py
+++ b/GPy/kern/parts/prod.py
@ -2,6 +2,7 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 from kernpart import Kernpart
+from coregionalize import Coregionalize
 import numpy as np
 import hashlib

@ -18,7 +19,7 @@ class Prod(Kernpart):
    """
    def __init__(self,k1,k2,tensor=False):
        self.num_params = k1.num_params + k2.num_params
-        self.name = k1.name + '<times>' + k2.name
+        self.name = '['+k1.name + '**' + k2.name +']'
        self.k1 = k1
        self.k2 = k2
        if tensor:
@ -60,7 +61,7 @@ class Prod(Kernpart):
        """Compute the part of the kernel associated with k2."""
        self._K_computations(X, X2)
        return self._K2
-    
+
    def dK_dtheta(self,dL_dK,X,X2,target):
        """Derivative of the covariance matrix with respect to the parameters."""
        self._K_computations(X,X2)
@ -90,8 +91,18 @@ class Prod(Kernpart):
    def dK_dX(self,dL_dK,X,X2,target):
        """derivative of the covariance matrix with respect to X."""
        self._K_computations(X,X2)
-        self.k1.dK_dX(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:,self.slice1])
-        self.k2.dK_dX(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[:,self.slice2])
+        # Product rule: each factor's dK_dX receives dL_dK multiplied by the other
+        # factor's kernel matrix (_K2 for k1, _K1 for k2) and only sees the columns
+        # of X, X2 and target selected by its own slice.
+        if X2 is None:
+            if not isinstance(self.k1,Coregionalize) and not isinstance(self.k2,Coregionalize):
+                # Neither factor is a Coregionalize part: forward None so that each
+                # factor handles the X2-is-None case itself.
+                self.k1.dK_dX(dL_dK*self._K2, X[:,self.slice1], None, target[:,self.slice1])
+                self.k2.dK_dX(dL_dK*self._K1, X[:,self.slice2], None, target[:,self.slice2])
+            else:#if isinstance(self.k1,Coregionalize) or isinstance(self.k2,Coregionalize):
+                #NOTE The indices column in the inputs makes the ki.dK_dX fail when passing None instead of X[:,self.slicei]
+                # Workaround: pass X explicitly as X2 and double dL_dK here instead.
+                # NOTE(review): presumably this reproduces the factor 2 the factors
+                # would apply themselves for X2=None -- confirm.
+                X2 = X
+                self.k1.dK_dX(2.*dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:,self.slice1])
+                self.k2.dK_dX(2.*dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[:,self.slice2])
+        else:
+            self.k1.dK_dX(dL_dK*self._K2, X[:,self.slice1], X2[:,self.slice1], target[:,self.slice1])
+            self.k2.dK_dX(dL_dK*self._K1, X[:,self.slice2], X2[:,self.slice2], target[:,self.slice2])

    def dKdiag_dX(self, dL_dKdiag, X, target):
        K1 = np.zeros(X.shape[0])
--- a/GPy/kern/parts/rational_quadratic.py
+++ b/GPy/kern/parts/rational_quadratic.py
@ -57,7 +57,7 @@ class RationalQuadratic(Kernpart):
        dist2 = np.square((X-X2.T)/self.lengthscale)

        dvar = (1 + dist2/2.)**(-self.power)
-        dl = self.power * self.variance * dist2 * self.lengthscale**(-3) * (1 + dist2/2./self.power)**(-self.power-1)
+        dl = self.power * self.variance * dist2 / self.lengthscale * (1 + dist2/2.)**(-self.power-1)
        dp = - self.variance * np.log(1 + dist2/2.) * (1 + dist2/2.)**(-self.power)

        target[0] += np.sum(dvar*dL_dK)
@ -70,10 +70,12 @@ class RationalQuadratic(Kernpart):

    def dK_dX(self,dL_dK,X,X2,target):
        """derivative of the covariance matrix with respect to X."""
-        if X2 is None: X2 = X
-        dist2 = np.square((X-X2.T)/self.lengthscale)
-
-        dX = -self.variance*self.power * (X-X2.T)/self.lengthscale**2 *  (1 + dist2/2./self.lengthscale)**(-self.power-1)
+        # With dist2 = ((x - x')/lengthscale)**2 and
+        #   K = variance * (1 + dist2/2)**(-power)      (see dvar in dK_dtheta),
+        #   dK/dx = -variance * power * (x - x')/lengthscale**2 * (1 + dist2/2)**(-power-1).
+        # The base of the power is therefore (1 + dist2/2.), exactly as in dK_dtheta,
+        # with no further division by the lengthscale.
+        if X2 is None:
+            # X plays both roles; the gradient is doubled because X enters K(X, X)
+            # through both arguments (assumes dL_dK is symmetric).
+            diff = X-X.T
+            scale = 2.
+        else:
+            diff = X-X2.T
+            scale = 1.
+        dist2 = np.square(diff/self.lengthscale)
+        dX = -scale*self.variance*self.power * diff/self.lengthscale**2 * (1 + dist2/2.)**(-self.power-1)
        target += np.sum(dL_dK*dX,1)[:,np.newaxis]

    def dKdiag_dX(self,dL_dKdiag,X,target):
--- a/GPy/kern/parts/rbf.py
+++ b/GPy/kern/parts/rbf.py
@ -138,7 +138,10 @@ class RBF(Kernpart):

    def dK_dX(self, dL_dK, X, X2, target):
+        """Accumulate into ``target`` (in place) the ``dL_dK``-weighted gradient of K wrt X.
+
+        When ``X2`` is None, ``X`` is used on both sides and the pairwise
+        differences are doubled.
+        """
        self._K_computations(X, X2)
-        _K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena.
+        if X2 is None:
+            # NOTE(review): the factor 2 presumably accounts for X entering K(X, X)
+            # through both arguments (valid when dL_dK is symmetric) -- confirm.
+            _K_dist = 2*(X[:, None, :] - X[None, :, :])
+        else:
+            _K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena.
+        # The first two axes are swapped so the product lines up with dL_dK.T;
+        # summing over axis 0 then leaves one gradient row per row of X.
        dK_dX = (-self.variance / self.lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2))
        target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)

--- a/GPy/kern/parts/rbf_inv.py
+++ b/GPy/kern/parts/rbf_inv.py
@ -133,7 +133,10 @@ class RBFInv(RBF):

    def dK_dX(self, dL_dK, X, X2, target):
+        """Accumulate into ``target`` (in place) the ``dL_dK``-weighted gradient of K wrt X.
+
+        Uses the squared inverse lengthscale (``inv_lengthscale2``) as the scaling
+        factor. When ``X2`` is None, ``X`` is used on both sides and the pairwise
+        differences are doubled.
+        """
        self._K_computations(X, X2)
-        _K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena.
+        # NOTE(review): the factor 2 in the X2-is-None branch presumably accounts for
+        # X entering K(X, X) through both arguments (valid when dL_dK is symmetric) -- confirm.
+        if X2 is None:
+            _K_dist = 2*(X[:, None, :] - X[None, :, :])
+        else:
+            _K_dist = X[:, None, :] - X2[None, :, :] # don't cache this in _K_computations because it is high memory. If this function is being called, chances are we're not in the high memory arena.
+        # The first two axes are swapped so the product lines up with dL_dK.T;
+        # summing over axis 0 then leaves one gradient row per row of X.
        dK_dX = (-self.variance * self.inv_lengthscale2) * np.transpose(self._K_dvar[:, :, np.newaxis] * _K_dist, (1, 0, 2))
        target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)