Mirror of https://github.com/SheffieldML/GPy.git, synced 2026-05-03 00:32:39 +02:00

Merging changed files.
This commit is contained in: 4154a4afb6
40 changed files with 555 additions and 415 deletions
@@ -17,6 +17,7 @@ def rbf_inv(input_dim,variance=1., inv_lengthscale=None,ARD=False):
    :type lengthscale: float
    :param ARD: Auto Relevance Determination (one lengthscale per dimension)
    :type ARD: Boolean

    """
    part = parts.rbf_inv.RBFInv(input_dim,variance,inv_lengthscale,ARD)
    return kern(input_dim, [part])

@@ -33,6 +34,7 @@ def rbf(input_dim,variance=1., lengthscale=None,ARD=False):
    :type lengthscale: float
    :param ARD: Auto Relevance Determination (one lengthscale per dimension)
    :type ARD: Boolean

    """
    part = parts.rbf.RBF(input_dim,variance,lengthscale,ARD)
    return kern(input_dim, [part])
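For orientation, a minimal usage sketch of these constructors (the `GPy.kern` namespace and the `K()` call follow the code shown in this diff; the hyperparameter values are illustrative):

    import numpy as np
    import GPy

    # isotropic RBF on 2-D inputs: k(x, y) = variance * exp(-0.5 * ||x - y||^2 / lengthscale^2)
    k = GPy.kern.rbf(input_dim=2, variance=1.0, lengthscale=0.5)

    # ARD variant: one lengthscale per input dimension
    k_ard = GPy.kern.rbf(input_dim=2, variance=1.0, lengthscale=np.array([0.5, 2.0]), ARD=True)

    X = np.random.randn(10, 2)
    K = k.K(X)  # 10 x 10 covariance matrix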
@@ -41,11 +43,13 @@ def linear(input_dim,variances=None,ARD=False):
    """
    Construct a linear kernel.

    Arguments
    ---------
    input_dimD (int), obligatory
    variances (np.ndarray)
    ARD (boolean)
    :param input_dim: dimensionality of the kernel, obligatory
    :type input_dim: int
    :param variances:
    :type variances: np.ndarray
    :param ARD: Auto Relevance Determination (one lengthscale per dimension)
    :type ARD: Boolean

    """
    part = parts.linear.Linear(input_dim,variances,ARD)
    return kern(input_dim, [part])
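The linear part reduces to a weighted inner product; a minimal numpy sketch of what it computes (hypothetical helper name, for illustration):

    import numpy as np

    def linear_K(X, X2, variances):
        # k(x, y) = sum_i variances_i * x_i * y_i, i.e. X diag(variances) X2^T
        return (X * variances) @ X2.T

    X = np.random.randn(5, 3)
    K = linear_K(X, X, np.array([1.0, 0.5, 2.0]))  # ARD: one variance per dimension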
@@ -64,39 +68,42 @@ def mlp(input_dim,variance=1., weight_variance=None,bias_variance=100.,ARD=False
    :type bias_variance: float
    :param ARD: Auto Relevance Determination (allows for ARD version of covariance)
    :type ARD: Boolean

    """
    part = parts.mlp.MLP(input_dim,variance,weight_variance,bias_variance,ARD)
    return kern(input_dim, [part])

def gibbs(input_dim,variance=1., mapping=None):
    """
    Gibbs and MacKay non-stationary covariance function.

    .. math::

        r = sqrt((x_i - x_j)'*(x_i - x_j))
        r = \\sqrt{((x_i - x_j)'*(x_i - x_j))}

        k(x_i, x_j) = \sigma^2*Z*exp(-r^2/(l(x)*l(x) + l(x')*l(x')))
        k(x_i, x_j) = \\sigma^2*Z*exp(-r^2/(l(x)*l(x) + l(x')*l(x')))

        Z = \sqrt{2*l(x)*l(x')/(l(x)*l(x) + l(x')*l(x')}
        Z = \\sqrt{2*l(x)*l(x')/(l(x)*l(x) + l(x')*l(x')}

    where :math:`l(x)` is a function giving the length scale as a function of space.
    This is the non stationary kernel proposed by Mark Gibbs in his 1997
    thesis. It is similar to an RBF but has a length scale that varies
    with input location. This leads to an additional term in front of
    the kernel.
    Where :math:`l(x)` is a function giving the length scale as a function of space.

    The parameters are :math:`\sigma^2`, the process variance, and the parameters of l(x) which is a function that can be specified by the user, by default an multi-layer peceptron is used is used.
    This is the non stationary kernel proposed by Mark Gibbs in his 1997
    thesis. It is similar to an RBF but has a length scale that varies
    with input location. This leads to an additional term in front of
    the kernel.

    :param input_dim: the number of input dimensions
    :type input_dim: int
    :param variance: the variance :math:`\sigma^2`
    :type variance: float
    :param mapping: the mapping that gives the lengthscale across the input space.
    :type mapping: GPy.core.Mapping
    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter \sigma^2_w), otherwise there is one weight variance parameter per dimension.
    :type ARD: Boolean
    :rtype: Kernpart object
    The parameters are :math:`\\sigma^2`, the process variance, and the parameters of l(x), which is a function that can be specified by the user; by default a multi-layer perceptron is used.

    :param input_dim: the number of input dimensions
    :type input_dim: int
    :param variance: the variance :math:`\\sigma^2`
    :type variance: float
    :param mapping: the mapping that gives the lengthscale across the input space.
    :type mapping: GPy.core.Mapping
    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (i.e. one weight variance parameter :math:`\\sigma^2_w`), otherwise there is one weight variance parameter per dimension.
    :type ARD: Boolean
    :rtype: Kernpart object

    """
    part = parts.gibbs.Gibbs(input_dim,variance,mapping)
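For concreteness, a plain numpy sketch of the Gibbs covariance in one dimension, matching the formula above (`lengthscale_fn` is an illustrative stand-in for the mapping l(x)):

    import numpy as np

    def gibbs_K(x, y, variance, lengthscale_fn):
        # input-dependent lengthscales l(x), l(y)
        lx, ly = lengthscale_fn(x), lengthscale_fn(y)
        denom = lx**2 + ly**2
        Z = np.sqrt(2.0 * lx * ly / denom)  # the extra term in front of the RBF-like part
        return variance * Z * np.exp(-(x - y)**2 / denom)

    # e.g. a lengthscale that grows with |x|
    k01 = gibbs_K(0.0, 1.0, variance=1.0, lengthscale_fn=lambda x: 0.5 + 0.1 * abs(x))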
@@ -124,6 +131,7 @@ def poly(input_dim,variance=1., weight_variance=None,bias_variance=1.,degree=2,
    :type degree: int
    :param ARD: Auto Relevance Determination (allows for ARD version of covariance)
    :type ARD: Boolean

    """
    part = parts.poly.POLY(input_dim,variance,weight_variance,bias_variance,degree,ARD)
    return kern(input_dim, [part])

@@ -132,10 +140,11 @@ def white(input_dim,variance=1.):
    """
    Construct a white kernel.

    Arguments
    ---------
    input_dimD (int), obligatory
    variance (float)
    :param input_dim: dimensionality of the kernel, obligatory
    :type input_dim: int
    :param variance: the variance of the kernel
    :type variance: float

    """
    part = parts.white.White(input_dim,variance)
    return kern(input_dim, [part])

@@ -180,6 +189,7 @@ def exponential(input_dim,variance=1., lengthscale=None, ARD=False):
    :type lengthscale: float
    :param ARD: Auto Relevance Determination (one lengthscale per dimension)
    :type ARD: Boolean

    """
    part = parts.exponential.Exponential(input_dim,variance, lengthscale, ARD)
    return kern(input_dim, [part])

@@ -196,6 +206,7 @@ def Matern32(input_dim,variance=1., lengthscale=None, ARD=False):
    :type lengthscale: float
    :param ARD: Auto Relevance Determination (one lengthscale per dimension)
    :type ARD: Boolean

    """
    part = parts.Matern32.Matern32(input_dim,variance, lengthscale, ARD)
    return kern(input_dim, [part])

@@ -212,6 +223,7 @@ def Matern52(input_dim, variance=1., lengthscale=None, ARD=False):
    :type lengthscale: float
    :param ARD: Auto Relevance Determination (one lengthscale per dimension)
    :type ARD: Boolean

    """
    part = parts.Matern52.Matern52(input_dim, variance, lengthscale, ARD)
    return kern(input_dim, [part])
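For reference, the standard isotropic Matérn 3/2 and 5/2 forms these parts implement, written in plain numpy (illustrative helper names; `r` is the Euclidean distance between inputs):

    import numpy as np

    def matern32_K(r, variance, lengthscale):
        # k(r) = sigma^2 * (1 + sqrt(3) r / l) * exp(-sqrt(3) r / l)
        a = np.sqrt(3.0) * r / lengthscale
        return variance * (1.0 + a) * np.exp(-a)

    def matern52_K(r, variance, lengthscale):
        # k(r) = sigma^2 * (1 + sqrt(5) r / l + 5 r^2 / (3 l^2)) * exp(-sqrt(5) r / l)
        a = np.sqrt(5.0) * r / lengthscale
        return variance * (1.0 + a + a**2 / 3.0) * np.exp(-a)

    r = np.linspace(0.0, 3.0, 7)
    K32, K52 = matern32_K(r, 1.0, 1.0), matern52_K(r, 1.0, 1.0)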
@@ -220,10 +232,11 @@ def bias(input_dim, variance=1.):
    """
    Construct a bias kernel.

    Arguments
    ---------
    input_dim (int), obligatory
    variance (float)
    :param input_dim: dimensionality of the kernel, obligatory
    :type input_dim: int
    :param variance: the variance of the kernel
    :type variance: float

    """
    part = parts.bias.Bias(input_dim, variance)
    return kern(input_dim, [part])

@@ -231,10 +244,15 @@ def bias(input_dim, variance=1.):
def finite_dimensional(input_dim, F, G, variances=1., weights=None):
    """
    Construct a finite dimensional kernel.
    input_dim: int - the number of input dimensions
    F: np.array of functions with shape (n,) - the n basis functions
    G: np.array with shape (n,n) - the Gram matrix associated to F
    variances : np.ndarray with shape (n,)

    :param input_dim: the number of input dimensions
    :type input_dim: int
    :param F: np.array of functions with shape (n,) - the n basis functions
    :type F: np.array
    :param G: np.array with shape (n,n) - the Gram matrix associated to F
    :type G: np.array
    :param variances: np.ndarray with shape (n,)
    :type variances: np.ndarray
    """
    part = parts.finite_dimensional.FiniteDimensional(input_dim, F, G, variances, weights)
    return kern(input_dim, [part])

@@ -247,6 +265,7 @@ def spline(input_dim, variance=1.):
    :type input_dim: int
    :param variance: the variance of the kernel
    :type variance: float

    """
    part = parts.spline.Spline(input_dim, variance)
    return kern(input_dim, [part])

@@ -259,6 +278,7 @@ def Brownian(input_dim, variance=1.):
    :type input_dim: int
    :param variance: the variance of the kernel
    :type variance: float

    """
    part = parts.Brownian.Brownian(input_dim, variance)
    return kern(input_dim, [part])

@@ -312,6 +332,7 @@ def periodic_exponential(input_dim=1, variance=1., lengthscale=None, period=2 *
    :type period: float
    :param n_freq: the number of frequencies considered for the periodic subspace
    :type n_freq: int

    """
    part = parts.periodic_exponential.PeriodicExponential(input_dim, variance, lengthscale, period, n_freq, lower, upper)
    return kern(input_dim, [part])

@@ -330,6 +351,7 @@ def periodic_Matern32(input_dim, variance=1., lengthscale=None, period=2 * np.pi
    :type period: float
    :param n_freq: the number of frequencies considered for the periodic subspace
    :type n_freq: int

    """
    part = parts.periodic_Matern32.PeriodicMatern32(input_dim, variance, lengthscale, period, n_freq, lower, upper)
    return kern(input_dim, [part])

@@ -348,6 +370,7 @@ def periodic_Matern52(input_dim, variance=1., lengthscale=None, period=2 * np.pi
    :type period: float
    :param n_freq: the number of frequencies considered for the periodic subspace
    :type n_freq: int

    """
    part = parts.periodic_Matern52.PeriodicMatern52(input_dim, variance, lengthscale, period, n_freq, lower, upper)
    return kern(input_dim, [part])

@@ -361,6 +384,7 @@ def prod(k1,k2,tensor=False):
    :param tensor: the kernels are either multiplied as functions defined on the same input space (default) or on the product of the input spaces
    :type tensor: Boolean
    :rtype: kernel object

    """
    part = parts.prod.Prod(k1, k2, tensor)
    return kern(part.input_dim, [part])
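A short usage sketch for kernel products, following the constructor signatures in this diff (the kernel choices are illustrative):

    import GPy

    k1 = GPy.kern.rbf(input_dim=2)
    k2 = GPy.kern.linear(input_dim=2)

    # same input space: k(x, y) = k1(x, y) * k2(x, y), still a 2-D kernel
    k_same = GPy.kern.prod(k1, k2)

    # tensor product: k((x, x'), (y, y')) = k1(x, y) * k2(x', y'), a 4-D kernel
    k_tensor = GPy.kern.prod(k1, k2, tensor=True)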
@@ -376,10 +400,12 @@ def symmetric(k):
def coregionalize(output_dim,rank=1, W=None, kappa=None):
    """
    Coregionalization matrix B, of the form:

    .. math::
        \mathbf{B} = \mathbf{W}\mathbf{W}^\top + kappa \mathbf{I}

    An intrinsic/linear coregionalization kernel of the form
    An intrinsic/linear coregionalization kernel of the form:

    .. math::
        k_2(x, y)=\mathbf{B} k(x, y)
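The coregionalization matrix is easy to form directly; a numpy sketch with illustrative shapes (rank-1 case):

    import numpy as np

    output_dim, rank = 3, 1
    W = np.random.randn(output_dim, rank)  # low-rank factor coupling the outputs
    kappa = 0.5 * np.ones(output_dim)      # per-output independent variance

    B = W @ W.T + np.diag(kappa)           # B = W W^T + diag(kappa), positive semi-definite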
@@ -449,7 +475,7 @@ def independent_outputs(k):

def hierarchical(k):
    """
    TODO THis can't be right! Construct a kernel with independent outputs from an existing kernel
    TODO This can't be right! Construct a kernel with independent outputs from an existing kernel
    """
    # for sl in k.input_slices:
    #     assert (sl.start is None) and (sl.stop is None), "cannot adjust input slices! (TODO)"

@@ -467,7 +493,8 @@ def build_lcm(input_dim, output_dim, kernel_list = [], rank=1,W=None,kappa=None)
    :param rank: number of tuples of the coregionalization parameters 'coregion_W'
    :type rank: integer

    ..Note the kernels dimensionality is overwritten to fit input_dim
    ..note the kernels' dimensionality is overwritten to fit input_dim

    """
    for k in kernel_list:

@@ -78,13 +78,15 @@ class kern(Parameterized):

    def plot_ARD(self, fignum=None, ax=None, title='', legend=False):
        """If an ARD kernel is present, it bar-plots the ARD parameters,
        """If an ARD kernel is present, it bar-plots the ARD parameters.

        :param fignum: figure number of the plot
        :param ax: matplotlib axis to plot on
        :param title:
            title of the plot,
            pass '' to not print a title
            pass None for a generic title

        """
        if ax is None:
            fig = pb.figure(fignum)

@@ -175,8 +177,10 @@ class kern(Parameterized):
    def add(self, other, tensor=False):
        """
        Add another kernel to this one. Both kernels are defined on the same _space_

        :param other: the other kernel to be added
        :type other: GPy.kern

        """
        if tensor:
            D = self.input_dim + other.input_dim
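A usage sketch of `add` (following the signature above; operator overloading such as `k1 + k2` is not shown in this diff, so only the explicit call is used):

    import GPy

    k1 = GPy.kern.rbf(input_dim=2)
    k2 = GPy.kern.white(input_dim=2)

    # same space: k(x, y) = k1(x, y) + k2(x, y)
    k_sum = k1.add(k2)

    # tensor=True concatenates the input spaces: the result is a (2 + 2)-D kernel
    k_tensor = k1.add(GPy.kern.rbf(input_dim=2), tensor=True)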
@@ -218,11 +222,13 @@ class kern(Parameterized):

    def prod(self, other, tensor=False):
        """
        multiply two kernels (either on the same space, or on the tensor product of the input space).
        Multiply two kernels (either on the same space, or on the tensor product of the input space).

        :param other: the other kernel to be multiplied
        :type other: GPy.kern
        :param tensor: whether or not to use the tensor space (default is false).
        :type tensor: bool

        """
        K1 = self.copy()
        K2 = other.copy()

@@ -321,6 +327,7 @@ class kern(Parameterized):
        :type X: np.ndarray (num_samples x input_dim)
        :param X2: Observed data inputs (optional, defaults to X)
        :type X2: np.ndarray (num_inducing x input_dim)

        """
        assert X.shape[1] == self.input_dim
        target = np.zeros(self.num_params)

@@ -340,6 +347,7 @@ class kern(Parameterized):
        :type X: np.ndarray (num_samples x input_dim)
        :param X2: Observed data inputs (optional, defaults to X)
        :type X2: np.ndarray (num_inducing x input_dim)"""

        target = np.zeros_like(X)
        if X2 is None:
            [p.dK_dX(dL_dK, X[:, i_s], None, target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]

@@ -413,6 +421,7 @@ class kern(Parameterized):
        :param Z: np.ndarray of inducing inputs (num_inducing x input_dim)
        :param mu, S: np.ndarrays of means and variances (each num_samples x input_dim)
        :returns psi2: np.ndarray (num_samples,num_inducing,num_inducing)

        """
        target = np.zeros((mu.shape[0], Z.shape[0], Z.shape[0]))
        [p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)]

@@ -568,7 +577,7 @@ class Kern_check_model(Model):

    def is_positive_definite(self):
        v = np.linalg.eig(self.kernel.K(self.X))[0]
        if any(v<0):
        if any(v<-1e-6):
            return False
        else:
            return True
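The relaxed threshold above tolerates tiny negative eigenvalues caused by floating-point round-off; a standalone sketch of the same check (using `eigvalsh`, a hypothetical refinement that exploits the symmetry of K):

    import numpy as np

    def is_positive_semidefinite(K, tol=1e-6):
        # symmetric matrices have real eigenvalues; eigvalsh is the stable choice here
        v = np.linalg.eigvalsh(K)
        return bool(np.all(v > -tol))  # tolerate tiny negatives from round-off

    K = np.array([[1.0, 0.9], [0.9, 1.0]])
    assert is_positive_semidefinite(K)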
@@ -657,6 +666,7 @@ def kern_test(kern, X=None, X2=None, verbose=False):
    :type X: ndarray
    :param X2: X2 input values to test the covariance function.
    :type X2: ndarray

    """
    pass_checks = True
    if X==None:

@@ -683,7 +693,7 @@ def kern_test(kern, X=None, X2=None, verbose=False):
        Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=True)
        pass_checks = False
        return False

    if verbose:
        print("Checking gradients of K(X, X2) wrt theta.")
    result = Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=verbose)

@@ -694,7 +704,7 @@ def kern_test(kern, X=None, X2=None, verbose=False):
        Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=True)
        pass_checks = False
        return False

    if verbose:
        print("Checking gradients of Kdiag(X) wrt theta.")
    result = Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=verbose)

@@ -705,10 +715,15 @@ def kern_test(kern, X=None, X2=None, verbose=False):
        Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=True)
        pass_checks = False
        return False

    if verbose:
        print("Checking gradients of K(X, X) wrt X.")
    result = Kern_check_dK_dX(kern, X=X, X2=None).checkgrad(verbose=verbose)
    try:
        result = Kern_check_dK_dX(kern, X=X, X2=None).checkgrad(verbose=verbose)
    except NotImplementedError:
        result=True
        if verbose:
            print("dK_dX not implemented for " + kern.name)
    if result and verbose:
        print("Check passed.")
    if not result:
@@ -719,7 +734,12 @@ def kern_test(kern, X=None, X2=None, verbose=False):

    if verbose:
        print("Checking gradients of K(X, X2) wrt X.")
    result = Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=verbose)
    try:
        result = Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=verbose)
    except NotImplementedError:
        result=True
        if verbose:
            print("dK_dX not implemented for " + kern.name)
    if result and verbose:
        print("Check passed.")
    if not result:

@@ -730,7 +750,12 @@ def kern_test(kern, X=None, X2=None, verbose=False):

    if verbose:
        print("Checking gradients of Kdiag(X) wrt X.")
    result = Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=verbose)
    try:
        result = Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=verbose)
    except NotImplementedError:
        result=True
        if verbose:
            print("dK_dX not implemented for " + kern.name)
    if result and verbose:
        print("Check passed.")
    if not result:

@@ -738,5 +763,5 @@ def kern_test(kern, X=None, X2=None, verbose=False):
        Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=True)
        pass_checks = False
        return False

    return pass_checks
@@ -11,12 +11,13 @@ class Coregionalize(Kernpart):
    """
    Covariance function for intrinsic/linear coregionalization models

    This covariance has the form
    This covariance has the form:
    .. math::
        \mathbf{B} = \mathbf{W}\mathbf{W}^\top + \text{diag}(kappa)

    An intrinsic/linear coregionalization covariance function of the form
    An intrinsic/linear coregionalization covariance function of the form:
    .. math::

        k_2(x, y)=\mathbf{B} k(x, y)

    it is obtained as the tensor product between a covariance function

@@ -31,7 +32,7 @@ class Coregionalize(Kernpart):
    :param kappa: a vector which allows the outputs to behave independently
    :type kappa: numpy array of dimensionality (output_dim,)

    .. Note: see coregionalization examples in GPy.examples.regression for some usage.
    .. note: see coregionalization examples in GPy.examples.regression for some usage.
    """
    def __init__(self, output_dim, rank=1, W=None, kappa=None):
        self.input_dim = 1
@@ -10,9 +10,12 @@ import GPy

class Hetero(Kernpart):
    """
    TODO: Need to constrain the function outputs positive (still thinking of best way of doing this!!! Yes, intend to use transformations, but what's the *best* way). Currently just squaring output.
    TODO: Need to constrain the function outputs
    positive (still thinking of best way of doing this!!! Yes, intend to use
    transformations, but what's the *best* way). Currently just squaring output.

    Heteroschedastic noise which depends on input location. See, for example, this paper by Goldberg et al.
    Heteroscedastic noise which depends on input location. See, for example,
    this paper by Goldberg et al.

    .. math::

@@ -20,15 +23,15 @@ class Hetero(Kernpart):

    where :math:`\sigma^2(x)` is a function giving the variance as a function of input space and :math:`\delta_{i,j}` is the Kronecker delta function.

    The parameters are the parameters of \sigma^2(x) which is a
    function that can be specified by the user, by default an
    multi-layer peceptron is used.
    The parameters are the parameters of \sigma^2(x), which is a
    function that can be specified by the user; by default a
    multi-layer perceptron is used.

    :param input_dim: the number of input dimensions
    :type input_dim: int
    :param mapping: the mapping that gives the lengthscale across the input space (by default GPy.mappings.MLP is used with 20 hidden nodes).
    :type mapping: GPy.core.Mapping
    :rtype: Kernpart object
    :param input_dim: the number of input dimensions
    :type input_dim: int
    :param mapping: the mapping that gives the lengthscale across the input space (by default GPy.mappings.MLP is used with 20 hidden nodes).
    :type mapping: GPy.core.Mapping
    :rtype: Kernpart object

    See this paper:

@@ -36,7 +39,7 @@ class Hetero(Kernpart):
    C. M. (1998) Regression with Input-dependent Noise: a Gaussian
    Process Treatment In Advances in Neural Information Processing
    Systems, Volume 10, pp. 493-499. MIT Press

    for a Gaussian process treatment of this problem.

    """
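The resulting model adds input-dependent noise to the diagonal of an underlying covariance; a toy numpy sketch (hypothetical `noise_fn` standing in for the mapping):

    import numpy as np

    def hetero_diag(X, noise_fn):
        # sigma^2(x) is modelled as the square of an unconstrained function,
        # which is the "just squaring output" mentioned in the TODO above
        return noise_fn(X).flatten() ** 2

    X = np.linspace(0.0, 1.0, 5)[:, None]
    K = np.eye(5)  # stand-in for an underlying kernel matrix K(X, X)
    K[np.diag_indices_from(K)] += hetero_diag(X, lambda X: 0.1 + X)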
@@ -47,7 +50,7 @@ class Hetero(Kernpart):
            mapping = GPy.mappings.MLP(output_dim=1, hidden_dim=20, input_dim=input_dim)
        if not transform:
            transform = GPy.core.transformations.logexp()

        self.transform = transform
        self.mapping = mapping
        self.name='hetero'

@@ -66,7 +69,7 @@ class Hetero(Kernpart):

    def K(self, X, X2, target):
        """Return covariance between X and X2."""
        if X2==None or X2 is X:
        if (X2 is None) or (X2 is X):
            target[np.diag_indices_from(target)] += self._Kdiag(X)

    def Kdiag(self, X, target):

@@ -76,26 +79,26 @@ class Hetero(Kernpart):
    def _Kdiag(self, X):
        """Helper function for computing the diagonal elements of the covariance."""
        return self.mapping.f(X).flatten()**2

    def dK_dtheta(self, dL_dK, X, X2, target):
        """Derivative of the covariance with respect to the parameters."""
        if X2==None or X2 is X:
        if (X2 is None) or (X2 is X):
            dL_dKdiag = dL_dK.flat[::dL_dK.shape[0]+1]
            self.dKdiag_dtheta(dL_dKdiag, X, target)

    def dKdiag_dtheta(self, dL_dKdiag, X, target):
        """Gradient of diagonal of covariance with respect to parameters."""
        target += 2.*self.mapping.df_dtheta(dL_dKdiag[:, None], X)*self.mapping.f(X)
        target += 2.*self.mapping.df_dtheta(dL_dKdiag[:, None]*self.mapping.f(X), X)

    def dK_dX(self, dL_dK, X, X2, target):
        """Derivative of the covariance matrix with respect to X."""
        if X2==None or X2 is X:
            dL_dKdiag = dL_dK.flat[::dL_dK.shape[0]+1]
            self.dKdiag_dX(dL_dKdiag, X, target)

    def dKdiag_dX(self, dL_dKdiag, X, target):
        """Gradient of diagonal of covariance with respect to X."""
        target += 2.*self.mapping.df_dX(dL_dKdiag[:, None], X)*self.mapping.f(X)
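The dKdiag_dtheta change above is a chain-rule fix worth spelling out. The diagonal is Kdiag(x) = f(x)^2, so dKdiag/dtheta = 2 f(x) df(x)/dtheta; the weighted sum over data points, sum_i dL_dKdiag_i * 2 f(x_i) * df(x_i)/dtheta, must fold the factor f(x_i) into the per-point weights before `df_dtheta` reduces over the data axis, which is what the new line does. A scalar sanity check of that identity (toy one-parameter mapping, purely illustrative):

    import numpy as np

    theta = 0.7
    f = lambda X, th: th * X   # toy mapping f(x) = theta * x
    df = lambda X: X           # df/dtheta for the toy mapping

    X = np.array([[1.0], [2.0], [3.0]])
    dL_dKdiag = np.array([0.1, 0.2, 0.3])

    # correct gradient: weight each df(x_i)/dtheta by f(x_i) inside the sum
    grad = np.sum(2.0 * dL_dKdiag[:, None] * f(X, theta) * df(X))

    # finite-difference check of d/dtheta sum_i dL_i * f(x_i)^2
    eps = 1e-6
    obj = lambda th: np.sum(dL_dKdiag[:, None] * f(X, th) ** 2)
    assert abs(grad - (obj(theta + eps) - obj(theta)) / eps) < 1e-4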
@@ -58,6 +58,8 @@ class Kernpart(object):
        raise NotImplementedError
    def dK_dX(self, dL_dK, X, X2, target):
        raise NotImplementedError
    def dKdiag_dX(self, dL_dK, X, target):
        raise NotImplementedError

@@ -97,6 +99,9 @@ class Kernpart_stationary(Kernpart):
        # wrt lengthscale is 0.
        target[0] += np.sum(dL_dKdiag)

    def dKdiag_dX(self, dL_dK, X, target):
        pass # true for all stationary kernels

class Kernpart_inner(Kernpart):
    def __init__(self,input_dim):
@@ -7,11 +7,13 @@ four_over_tau = 2./np.pi

class MLP(Kernpart):
    """
    multi layer perceptron kernel (also known as arc sine kernel or neural network kernel)

    Multi layer perceptron kernel (also known as arc sine kernel or neural network kernel)

    .. math::

        k(x,y) = \sigma^2 \frac{2}{\pi} \text{asin} \left(\frac{\sigma_w^2 x^\top y+\sigma_b^2}{\sqrt{\sigma_w^2x^\top x + \sigma_b^2 + 1}\sqrt{\sigma_w^2 y^\top y \sigma_b^2 +1}} \right)
        k(x,y) = \\sigma^{2}\\frac{2}{\\pi } \\text{asin} \\left ( \\frac{ \\sigma_w^2 x^\\top y+\\sigma_b^2}{\\sqrt{\\sigma_w^2x^\\top x + \\sigma_b^2 + 1}\\sqrt{\\sigma_w^2 y^\\top y + \\sigma_b^2 +1}} \\right )

    :param input_dim: the number of input dimensions
    :type input_dim: int
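A plain numpy sketch of the arc-sine formula above (illustrative, non-ARD case; both denominator factors have the form sigma_w^2 z'z + sigma_b^2 + 1):

    import numpy as np

    def mlp_K(X, Y, variance, weight_variance, bias_variance):
        # k(x,y) = sigma^2 * (2/pi) * asin( (sw2 x'y + sb2) / sqrt((sw2 x'x + sb2 + 1)(sw2 y'y + sb2 + 1)) )
        inner = weight_variance * X @ Y.T + bias_variance
        xx = weight_variance * np.sum(X * X, axis=1) + bias_variance + 1.0
        yy = weight_variance * np.sum(Y * Y, axis=1) + bias_variance + 1.0
        return variance * (2.0 / np.pi) * np.arcsin(inner / np.sqrt(np.outer(xx, yy)))

    X = np.random.randn(4, 3)
    K = mlp_K(X, X, variance=1.0, weight_variance=1.0, bias_variance=100.0)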
@@ -24,6 +26,7 @@ class MLP(Kernpart):
    :type ARD: Boolean
    :rtype: Kernpart object

    """

    def __init__(self, input_dim, variance=1., weight_variance=None, bias_variance=100., ARD=False):
@@ -7,22 +7,22 @@ four_over_tau = 2./np.pi

class POLY(Kernpart):
    """
    polynomial kernel parameter initialisation. Included for completeness, but generally not recommended, is the polynomial kernel,
    .. math::

        k(x, y) = \sigma^2*(\sigma_w^2 x'y+\sigma_b^b)^d

    The kernel parameters are \sigma^2 (variance), \sigma^2_w
    (weight_variance), \sigma^2_b (bias_variance) and d
    Polynomial kernel parameter initialisation. Included for completeness, but generally not recommended, is the polynomial kernel:

    .. math::
        k(x, y) = \sigma^2\*(\sigma_w^2 x'y+\sigma_b^2)^d

    The kernel parameters are :math:`\sigma^2` (variance), :math:`\sigma^2_w`
    (weight_variance), :math:`\sigma^2_b` (bias_variance) and d
    (degree). Only gradients of the first three are provided for
    kernel optimisation, it is assumed that polynomial degree would
    be set by hand.

    The kernel is not recommended as it is badly behaved when the
    \sigma^2_w*x'*y + \sigma^2_b has a magnitude greater than one. For completeness
    there will be an automatic relevance determination version of this
    kernel provided (NOT YET IMPLEMENTED!).

    :math:`\sigma^2_w\*x'\*y + \sigma^2_b` has a magnitude greater than one. For completeness
    there will be an automatic relevance determination version of this
    kernel provided (NOT YET IMPLEMENTED!).
    :param input_dim: the number of input dimensions
    :type input_dim: int
    :param variance: the variance :math:`\sigma^2`
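A direct numpy sketch of the polynomial form (illustrative, non-ARD):

    import numpy as np

    def poly_K(X, Y, variance, weight_variance, bias_variance, degree):
        # k(x, y) = sigma^2 * (sigma_w^2 x'y + sigma_b^2)^d
        return variance * (weight_variance * X @ Y.T + bias_variance) ** degree

    X = np.random.randn(4, 3)
    K = poly_K(X, X, variance=1.0, weight_variance=1.0, bias_variance=1.0, degree=2)

As the docstring warns, once sigma_w^2 x'y + sigma_b^2 has magnitude greater than one, the entries grow as its d-th power, which is what makes the kernel badly behaved.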
@@ -32,7 +32,7 @@ class POLY(Kernpart):
    :param bias_variance: the variance of the prior over bias parameters :math:`\sigma^2_b`
    :param degree: the degree of the polynomial.
    :type degree: int
    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter \sigma^2_w), otherwise there is one weight variance parameter per dimension.
    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (i.e. one weight variance parameter :math:`\sigma^2_w`), otherwise there is one weight variance parameter per dimension.
    :type ARD: Boolean
    :rtype: Kernpart object