diff --git a/GPy/core/parameterized.py b/GPy/core/parameterized.py index aeb9ef85..de1adaf8 100644 --- a/GPy/core/parameterized.py +++ b/GPy/core/parameterized.py @@ -231,17 +231,19 @@ class Parameterized(object): def constrain_fixed(self, regexp, value=None): """ - Arguments - --------- + :param regexp: which parameters need to be fixed. :type regexp: ndarray(dtype=int) or regular expression object or string :param value: the vlaue to fix the parameters to. If the value is not specified, the parameter is fixed to the current value :type value: float - Notes - ----- + + **Notes** + Fixing a parameter which is tied to another, or constrained in some way will result in an error. - To fix multiple parameters to the same value, simply pass a regular expression which matches both parameter names, or pass both of the indexes + + To fix multiple parameters to the same value, simply pass a regular expression which matches both parameter names, or pass both of the indexes. + """ matches = self.grep_param_names(regexp) overlap = set(matches).intersection(set(self.all_constrained_indices())) diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index cb96b478..fdbe9f9c 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -16,16 +16,17 @@ class SparseGP(GPBase): :type X: np.ndarray (num_data x input_dim) :param likelihood: a likelihood instance, containing the observed data :type likelihood: GPy.likelihood.(Gaussian | EP | Laplace) - :param kernel : the kernel (covariance function). See link kernels + :param kernel: the kernel (covariance function). See link kernels :type kernel: a GPy.kern.kern instance :param X_variance: The uncertainty in the measurements of X (Gaussian variance) :type X_variance: np.ndarray (num_data x input_dim) | None :param Z: inducing inputs (optional, see note) :type Z: np.ndarray (num_inducing x input_dim) | None - :param num_inducing : Number of inducing points (optional, default 10. Ignored if Z is not None) + :param num_inducing: Number of inducing points (optional, default 10. Ignored if Z is not None) :type num_inducing: int - :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales) + :param normalize_(X|Y): whether to normalize the data before computing (predictions will be in original scales) :type normalize_(X|Y): bool + """ def __init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False): @@ -306,10 +307,11 @@ class SparseGP(GPBase): def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False): """ + Predict the function(s) at the new point(s) Xnew. - Arguments - --------- + **Arguments** + :param Xnew: The points at which to make a prediction :type Xnew: np.ndarray, Nnew x self.input_dim :param X_variance_new: The uncertainty in the prediction points diff --git a/GPy/core/svigp.py b/GPy/core/svigp.py index b0175a39..b9101160 100644 --- a/GPy/core/svigp.py +++ b/GPy/core/svigp.py @@ -14,6 +14,7 @@ import sys class SVIGP(GPBase): """ + Stochastic Variational inference in a Gaussian Process :param X: inputs @@ -22,25 +23,26 @@ class SVIGP(GPBase): :type Y: np.ndarray of observations (N x D) :param batchsize: the size of a h - Additional kwargs are used as for a sparse GP. They include + Additional kwargs are used as for a sparse GP. They include: :param q_u: canonical parameters of the distribution squasehd into a 1D array :type q_u: np.ndarray - :param M : Number of inducing points (optional, default 10. 
Ignored if Z is not None) + :param M: Number of inducing points (optional, default 10. Ignored if Z is not None) :type M: int - :param kernel : the kernel/covariance function. See link kernels + :param kernel: the kernel/covariance function. See link kernels :type kernel: a GPy kernel :param Z: inducing inputs (optional, see note) :type Z: np.ndarray (M x Q) | None :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance) :type X_uncertainty: np.ndarray (N x Q) | None :param Zslices: slices for the inducing inputs (see slicing TODO: link) - :param M : Number of inducing points (optional, default 10. Ignored if Z is not None) + :param M: Number of inducing points (optional, default 10. Ignored if Z is not None) :type M: int - :param beta: noise precision. TODO> ignore beta if doing EP + :param beta: noise precision. TODO: ignore beta if doing EP :type beta: float - :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales) + :param normalize_(X|Y): whether to normalize the data before computing (predictions will be in original scales) :type normalize_(X|Y): bool + """ diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py index 88582351..a6c32c79 100644 --- a/GPy/examples/classification.py +++ b/GPy/examples/classification.py @@ -11,13 +11,15 @@ import GPy default_seed = 10000 def crescent_data(seed=default_seed, kernel=None): # FIXME - """Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood. + """ + Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood. :param model_type: type of model to fit ['Full', 'FITC', 'DTC']. - :param seed : seed value for data generation. + :param seed: seed value for data generation. :type seed: int - :param inducing : number of inducing variables (only used for 'FITC' or 'DTC'). + :param inducing: number of inducing variables (only used for 'FITC' or 'DTC'). :type inducing: int + """ data = GPy.util.datasets.crescent_data(seed=seed) @@ -35,6 +37,7 @@ def crescent_data(seed=default_seed, kernel=None): # FIXME def oil(num_inducing=50, max_iters=100, kernel=None): """ Run a Gaussian process classification on the three phase oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood. + """ data = GPy.util.datasets.oil() X = data['X'] @@ -64,8 +67,10 @@ def oil(num_inducing=50, max_iters=100, kernel=None): def toy_linear_1d_classification(seed=default_seed): """ Simple 1D classification example - :param seed : seed value for data generation (default is 4). + + :param seed: seed value for data generation (default is 4). :type seed: int + """ data = GPy.util.datasets.toy_linear_1d_classification(seed=seed) @@ -92,8 +97,10 @@ def toy_linear_1d_classification(seed=default_seed): def sparse_toy_linear_1d_classification(num_inducing=10,seed=default_seed): """ Sparse 1D classification example - :param seed : seed value for data generation (default is 4). + + :param seed: seed value for data generation (default is 4). :type seed: int + """ data = GPy.util.datasets.toy_linear_1d_classification(seed=seed) @@ -123,10 +130,11 @@ def sparse_crescent_data(num_inducing=10, seed=default_seed, kernel=None): Run a Gaussian process classification with DTC approxiamtion on the crescent data. 
The demonstration calls the basic GP classification model and uses EP to approximate the likelihood. :param model_type: type of model to fit ['Full', 'FITC', 'DTC']. - :param seed : seed value for data generation. + :param seed: seed value for data generation. :type seed: int - :param inducing : number of inducing variables (only used for 'FITC' or 'DTC'). + :param inducing: number of inducing variables (only used for 'FITC' or 'DTC'). :type inducing: int + """ data = GPy.util.datasets.crescent_data(seed=seed) @@ -147,10 +155,11 @@ def FITC_crescent_data(num_inducing=10, seed=default_seed): Run a Gaussian process classification with FITC approximation on the crescent data. The demonstration uses EP to approximate the likelihood. :param model_type: type of model to fit ['Full', 'FITC', 'DTC']. - :param seed : seed value for data generation. + :param seed: seed value for data generation. :type seed: int - :param inducing : number of inducing variables (only used for 'FITC' or 'DTC'). + :param inducing: number of inducing variables (only used for 'FITC' or 'DTC'). :type num_inducing: int + """ data = GPy.util.datasets.crescent_data(seed=seed) @@ -171,8 +180,10 @@ def FITC_crescent_data(num_inducing=10, seed=default_seed): def toy_heaviside(seed=default_seed): """ Simple 1D classification example using a heavy side gp transformation - :param seed : seed value for data generation (default is 4). + + :param seed: seed value for data generation (default is 4). :type seed: int + """ data = GPy.util.datasets.toy_linear_1d_classification(seed=seed) diff --git a/GPy/inference/conjugate_gradient_descent.py b/GPy/inference/conjugate_gradient_descent.py index 0f6603e5..9eabf5dd 100644 --- a/GPy/inference/conjugate_gradient_descent.py +++ b/GPy/inference/conjugate_gradient_descent.py @@ -233,7 +233,7 @@ class CGD(Async_Optimize): """ opt_async(self, f, df, x0, callback, update_rule=FletcherReeves, messages=0, maxiter=5e3, max_f_eval=15e3, gtol=1e-6, - report_every=10, *args, **kwargs) + report_every=10, \*args, \*\*kwargs) callback gets called every `report_every` iterations @@ -244,16 +244,14 @@ class CGD(Async_Optimize): f, and df will be called with - f(xi, *args, **kwargs) - df(xi, *args, **kwargs) + f(xi, \*args, \*\*kwargs) + df(xi, \*args, \*\*kwargs) - **returns** - ----------- + **Returns:** Started `Process` object, optimizing asynchronously - **calls** - --------- + **Calls:** callback(x_opt, f_opt, g_opt, iteration, function_calls, gradient_calls, status_message) @@ -265,7 +263,7 @@ class CGD(Async_Optimize): """ opt(self, f, df, x0, callback=None, update_rule=FletcherReeves, messages=0, maxiter=5e3, max_f_eval=15e3, gtol=1e-6, - report_every=10, *args, **kwargs) + report_every=10, \*args, \*\*kwargs) Minimize f, calling callback every `report_every` iterations with following syntax: @@ -276,11 +274,10 @@ class CGD(Async_Optimize): f, and df will be called with - f(xi, *args, **kwargs) - df(xi, *args, **kwargs) + f(xi, \*args, \*\*kwargs) + df(xi, \*args, \*\*kwargs) **returns** - --------- x_opt, f_opt, g_opt, iteration, function_calls, gradient_calls, status_message diff --git a/GPy/kern/constructors.py b/GPy/kern/constructors.py index 046b0205..28066413 100644 --- a/GPy/kern/constructors.py +++ b/GPy/kern/constructors.py @@ -17,6 +17,7 @@ def rbf_inv(input_dim,variance=1., inv_lengthscale=None,ARD=False): :type lengthscale: float :param ARD: Auto Relevance Determination (one lengthscale per dimension) :type ARD: Boolean + """ part = 
parts.rbf_inv.RBFInv(input_dim,variance,inv_lengthscale,ARD)
     return kern(input_dim, [part])

@@ -33,6 +34,7 @@ def rbf(input_dim,variance=1., lengthscale=None,ARD=False):
     :type lengthscale: float
     :param ARD: Auto Relevance Determination (one lengthscale per dimension)
     :type ARD: Boolean
+
     """
     part = parts.rbf.RBF(input_dim,variance,lengthscale,ARD)
     return kern(input_dim, [part])

@@ -41,11 +43,13 @@ def linear(input_dim,variances=None,ARD=False):
     """
     Construct a linear kernel.

-    Arguments
-    ---------
-    input_dimD (int), obligatory
-    variances (np.ndarray)
-    ARD (boolean)
+    :param input_dim: dimensionality of the kernel, obligatory
+    :type input_dim: int
+    :param variances: the variances of the kernel
+    :type variances: np.ndarray
+    :param ARD: Auto Relevance Determination (one lengthscale per dimension)
+    :type ARD: Boolean
+
     """
     part = parts.linear.Linear(input_dim,variances,ARD)
     return kern(input_dim, [part])

@@ -64,12 +68,14 @@ def mlp(input_dim,variance=1., weight_variance=None,bias_variance=100.,ARD=False
     :type bias_variance: float
     :param ARD: Auto Relevance Determination (allows for ARD version of covariance)
     :type ARD: Boolean
+
     """
     part = parts.mlp.MLP(input_dim,variance,weight_variance,bias_variance,ARD)
     return kern(input_dim, [part])

 def gibbs(input_dim,variance=1., mapping=None):
     """
+
     Gibbs and MacKay non-stationary covariance function.

     .. math::
@@ -124,6 +130,7 @@ def poly(input_dim,variance=1., weight_variance=None,bias_variance=1.,degree=2,
     :type degree: int
     :param ARD: Auto Relevance Determination (allows for ARD version of covariance)
     :type ARD: Boolean
+
     """
     part = parts.poly.POLY(input_dim,variance,weight_variance,bias_variance,degree,ARD)
     return kern(input_dim, [part])

@@ -132,10 +139,11 @@ def white(input_dim,variance=1.):
     """
     Construct a white kernel.

-    Arguments
-    ---------
-    input_dimD (int), obligatory
-    variance (float)
+    :param input_dim: dimensionality of the kernel, obligatory
+    :type input_dim: int
+    :param variance: the variance of the kernel
+    :type variance: float
+
     """
     part = parts.white.White(input_dim,variance)
     return kern(input_dim, [part])

@@ -153,6 +161,7 @@ def exponential(input_dim,variance=1., lengthscale=None, ARD=False):
     :type lengthscale: float
     :param ARD: Auto Relevance Determination (one lengthscale per dimension)
     :type ARD: Boolean
+
     """
     part = parts.exponential.Exponential(input_dim,variance, lengthscale, ARD)
     return kern(input_dim, [part])

@@ -169,6 +178,7 @@ def Matern32(input_dim,variance=1., lengthscale=None, ARD=False):
     :type lengthscale: float
     :param ARD: Auto Relevance Determination (one lengthscale per dimension)
     :type ARD: Boolean
+
     """
     part = parts.Matern32.Matern32(input_dim,variance, lengthscale, ARD)
     return kern(input_dim, [part])

@@ -185,6 +195,7 @@ def Matern52(input_dim, variance=1., lengthscale=None, ARD=False):
     :type lengthscale: float
     :param ARD: Auto Relevance Determination (one lengthscale per dimension)
     :type ARD: Boolean
+
     """
     part = parts.Matern52.Matern52(input_dim, variance, lengthscale, ARD)
     return kern(input_dim, [part])

@@ -193,10 +204,11 @@ def bias(input_dim, variance=1.):
     """
     Construct a bias kernel.
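+
+    A minimal usage sketch (hypothetical values; assuming this constructor is
+    exported via ``GPy.kern`` like the other constructors in this module)::
+
+        import GPy
+        # a one-dimensional bias kernel with a hypothetical variance
+        k = GPy.kern.bias(input_dim=1, variance=0.5)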
-    Arguments
-    ---------
-    input_dim (int), obligatory
-    variance (float)
+    :param input_dim: dimensionality of the kernel, obligatory
+    :type input_dim: int
+    :param variance: the variance of the kernel
+    :type variance: float
+
     """
     part = parts.bias.Bias(input_dim, variance)
     return kern(input_dim, [part])

@@ -204,10 +216,15 @@ def bias(input_dim, variance=1.):
 def finite_dimensional(input_dim, F, G, variances=1., weights=None):
     """
     Construct a finite dimensional kernel.
-    input_dim: int - the number of input dimensions
-    F: np.array of functions with shape (n,) - the n basis functions
-    G: np.array with shape (n,n) - the Gram matrix associated to F
-    variances : np.ndarray with shape (n,)
+
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
+    :param F: np.array of functions with shape (n,) - the n basis functions
+    :type F: np.array
+    :param G: np.array with shape (n,n) - the Gram matrix associated to F
+    :type G: np.array
+    :param variances: np.ndarray with shape (n,)
+    :type variances: np.ndarray
     """
     part = parts.finite_dimensional.FiniteDimensional(input_dim, F, G, variances, weights)
     return kern(input_dim, [part])

@@ -220,6 +237,7 @@ def spline(input_dim, variance=1.):
     :type input_dim: int
     :param variance: the variance of the kernel
     :type variance: float
+
     """
     part = parts.spline.Spline(input_dim, variance)
     return kern(input_dim, [part])

@@ -232,6 +250,7 @@ def Brownian(input_dim, variance=1.):
     :type input_dim: int
     :param variance: the variance of the kernel
     :type variance: float
+
     """
     part = parts.Brownian.Brownian(input_dim, variance)
     return kern(input_dim, [part])

@@ -285,6 +304,7 @@ def periodic_exponential(input_dim=1, variance=1., lengthscale=None, period=2 *
     :type period: float
     :param n_freq: the number of frequencies considered for the periodic subspace
     :type n_freq: int
+
     """
     part = parts.periodic_exponential.PeriodicExponential(input_dim, variance, lengthscale, period, n_freq, lower, upper)
     return kern(input_dim, [part])

@@ -303,6 +323,7 @@ def periodic_Matern32(input_dim, variance=1., lengthscale=None, period=2 * np.pi
     :type period: float
     :param n_freq: the number of frequencies considered for the periodic subspace
     :type n_freq: int
+
     """
     part = parts.periodic_Matern32.PeriodicMatern32(input_dim, variance, lengthscale, period, n_freq, lower, upper)
     return kern(input_dim, [part])

@@ -321,6 +342,7 @@ def periodic_Matern52(input_dim, variance=1., lengthscale=None, period=2 * np.pi
     :type period: float
     :param n_freq: the number of frequencies considered for the periodic subspace
     :type n_freq: int
+
     """
     part = parts.periodic_Matern52.PeriodicMatern52(input_dim, variance, lengthscale, period, n_freq, lower, upper)
     return kern(input_dim, [part])

@@ -334,6 +356,7 @@ def prod(k1,k2,tensor=False):
     :param tensor: The kernels are either multiply as functions defined on the same input space (default) or on the product of the input spaces
     :type tensor: Boolean
     :rtype: kernel object
+
     """
     part = parts.prod.Prod(k1, k2, tensor)
     return kern(part.input_dim, [part])

@@ -349,10 +372,12 @@ def symmetric(k):
 def coregionalize(num_outputs,W_columns=1, W=None, kappa=None):
     """
     Coregionlization matrix B, of the form:
+
     .. math::
       \mathbf{B} = \mathbf{W}\mathbf{W}^\top + kappa \mathbf{I}

-    An intrinsic/linear coregionalization kernel of the form
+    An intrinsic/linear coregionalization kernel of the form:
+
     .. math::
       k_2(x, y)=\mathbf{B} k(x, y)
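+
+    A minimal construction sketch (hypothetical values; assuming the signature
+    ``coregionalize(num_outputs, W_columns=1, W=None, kappa=None)`` above and
+    that the constructor is exported via ``GPy.kern``)::
+
+        import GPy
+        # a coregionalization structure over two outputs, with a rank-1 W
+        B = GPy.kern.coregionalize(num_outputs=2, W_columns=1)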
@@ -422,7 +447,7 @@ def independent_outputs(k):
 def hierarchical(k):
     """
-    TODO THis can't be right! Construct a kernel with independent outputs from an existing kernel
+    TODO This can't be right! Construct a kernel with independent outputs from an existing kernel
     """
 #     for sl in k.input_slices:
 #         assert (sl.start is None) and (sl.stop is None), "cannot adjust input slices! (TODO)"
@@ -440,7 +465,8 @@ def build_lcm(input_dim, num_outputs, kernel_list = [], W_columns=1,W=None,kappa
     :param W_columns: number tuples of the corregionalization parameters 'coregion_W'
     :type W_columns: integer

-    ..Note the kernels dimensionality is overwritten to fit input_dim
+    .. note:: the kernel's dimensionality is overwritten to fit input_dim
+
     """

     for k in kernel_list:
diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index e7b3e30a..d8f77942 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -78,13 +78,15 @@ class kern(Parameterized):

     def plot_ARD(self, fignum=None, ax=None, title='', legend=False):
-        """If an ARD kernel is present, it bar-plots the ARD parameters,
+        """If an ARD kernel is present, it bar-plots the ARD parameters.
+
         :param fignum: figure number of the plot
         :param ax: matplotlib axis to plot on
         :param title:
             title of the plot,
             pass '' to not print a title
             pass None for a generic title
+
         """
         if ax is None:
             fig = pb.figure(fignum)
@@ -175,8 +177,10 @@ class kern(Parameterized):
     def add(self, other, tensor=False):
         """
         Add another kernel to this one. Both kernels are defined on the same _space_
+
         :param other: the other kernel to be added
         :type other: GPy.kern
+
         """
         if tensor:
             D = self.input_dim + other.input_dim
@@ -223,6 +227,7 @@ class kern(Parameterized):
         :type other: GPy.kern
         :param tensor: whether or not to use the tensor space (default is false).
         :type tensor: bool
+
         """
         K1 = self.copy()
         K2 = other.copy()
@@ -321,6 +326,7 @@ class kern(Parameterized):
         :type X: np.ndarray (num_samples x input_dim)
         :param X2: Observed data inputs (optional, defaults to X)
         :type X2: np.ndarray (num_inducing x input_dim)
+
         """
         assert X.shape[1] == self.input_dim
         target = np.zeros(self.num_params)
@@ -340,6 +346,7 @@ class kern(Parameterized):
         :type X: np.ndarray (num_samples x input_dim)
         :param X2: Observed data inputs (optional, defaults to X)
         :type X2: np.ndarray (num_inducing x input_dim)"""
+
         target = np.zeros_like(X)
         if X2 is None:
             [p.dK_dX(dL_dK, X[:, i_s], None, target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
@@ -413,6 +420,7 @@ class kern(Parameterized):
         :param Z: np.ndarray of inducing inputs (num_inducing x input_dim)
         :param mu, S: np.ndarrays of means and variances (each num_samples x input_dim)
         :returns psi2: np.ndarray (num_samples,num_inducing,num_inducing)
+
         """
         target = np.zeros((mu.shape[0], Z.shape[0], Z.shape[0]))
         [p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)]
@@ -657,6 +665,7 @@ def kern_test(kern, X=None, X2=None, verbose=False):
     :type X: ndarray
     :param X2: X2 input values to test the covariance function.
     :type X2: ndarray
+
     """
     pass_checks = True
     if X==None:
diff --git a/GPy/kern/parts/coregionalize.py b/GPy/kern/parts/coregionalize.py
index 363d98c3..d8a5c058 100644
--- a/GPy/kern/parts/coregionalize.py
+++ b/GPy/kern/parts/coregionalize.py
@@ -11,12 +11,14 @@ class Coregionalize(Kernpart):
     """
     Covariance function for intrinsic/linear coregionalization models

-    This covariance has the form
+    This covariance has the form:
     .. math::
+
       \mathbf{B} = \mathbf{W}\mathbf{W}^\top + kappa \mathbf{I}

-    An intrinsic/linear coregionalization covariance function of the form
+    An intrinsic/linear coregionalization covariance function of the form:
     .. math::
+
       k_2(x, y)=\mathbf{B} k(x, y)

     it is obtained as the tensor product between a covariance function
@@ -31,7 +33,7 @@ class Coregionalize(Kernpart):
     :param kappa: a vector which allows the outputs to behave independently
     :type kappa: numpy array of dimensionality  (num_outputs,)

-    .. Note: see coregionalization examples in GPy.examples.regression for some usage.
+    .. note:: see coregionalization examples in GPy.examples.regression for some usage.
     """
     def __init__(self,num_outputs,W_columns=1, W=None, kappa=None):
         self.input_dim = 1
diff --git a/GPy/kern/parts/mlp.py b/GPy/kern/parts/mlp.py
index f4825f3d..e68aaa72 100644
--- a/GPy/kern/parts/mlp.py
+++ b/GPy/kern/parts/mlp.py
@@ -7,11 +7,13 @@ four_over_tau = 2./np.pi

 class MLP(Kernpart):
     """
-    multi layer perceptron kernel (also known as arc sine kernel or neural network kernel)
+
+    Multi layer perceptron kernel (also known as arc sine kernel or neural network kernel)

     .. math::

-          k(x,y) = \sigma^2 \frac{2}{\pi} \text{asin} \left(\frac{\sigma_w^2 x^\top y+\sigma_b^2}{\sqrt{\sigma_w^2x^\top x + \sigma_b^2 + 1}\sqrt{\sigma_w^2 y^\top y \sigma_b^2 +1}} \right)
+          k(x,y) = \\sigma^{2}\\frac{2}{\\pi } \\text{asin} \\left ( \\frac{ \\sigma_w^2 x^\\top y+\\sigma_b^2}{\\sqrt{\\sigma_w^2x^\\top x + \\sigma_b^2 + 1}\\sqrt{\\sigma_w^2 y^\\top y \\sigma_b^2 +1}} \\right )
+

     :param input_dim: the number of input dimensions
     :type input_dim: int
@@ -24,6 +26,7 @@ class MLP(Kernpart):
     :type ARD: Boolean

     :rtype: Kernpart object
+
     """

     def __init__(self, input_dim, variance=1., weight_variance=None, bias_variance=100., ARD=False):
diff --git a/GPy/kern/parts/poly.py b/GPy/kern/parts/poly.py
index cdc65210..edb7ed10 100644
--- a/GPy/kern/parts/poly.py
+++ b/GPy/kern/parts/poly.py
@@ -7,19 +7,20 @@ four_over_tau = 2./np.pi

 class POLY(Kernpart):
     """
-    polynomial kernel parameter initialisation. Included for completeness, but generally not recommended, is the polynomial kernel,
-    .. math::
-
-         k(x, y) = \sigma^2*(\sigma_w^2 x'y+\sigma_b^b)^d
-    The kernel parameters are \sigma^2 (variance), \sigma^2_w
-    (weight_variance), \sigma^2_b (bias_variance) and d
+
+    Polynomial kernel parameter initialisation. Included for completeness, but generally not recommended, is the polynomial kernel:
+
+    .. math::
+         k(x, y) = \sigma^2 (\sigma_w^2 x'y + \sigma_b^2)^d
+
+    The kernel parameters are :math:`\sigma^2` (variance), :math:`\sigma^2_w`
+    (weight_variance), :math:`\sigma^2_b` (bias_variance) and d
     (degree). Only gradients of the first three are provided for
     kernel optimisation, it is assumed that polynomial degree would be
     set by hand.

     The kernel is not recommended as it is badly behaved when the
-    \sigma^2_w*x'*y + \sigma^2_b has a magnitude greater than one. For completeness
+    :math:`\sigma^2_w x'y + \sigma^2_b` has a magnitude greater than one. For completeness
     there is an automatic relevance determination version of this kernel
     provided.
@@ -32,7 +33,7 @@ class POLY(Kernpart):
     :param bias_variance: the variance of the prior over bias parameters :math:`\sigma^2_b`
     :param degree: the degree of the polynomial.
     :type degree: int
-    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter \sigma^2_w), otherwise there is one weight variance parameter per dimension.
+    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter :math:`\sigma^2_w`), otherwise there is one weight variance parameter per dimension.
     :type ARD: Boolean

     :rtype: Kernpart object
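+
+    A minimal construction sketch (hypothetical values; assuming the
+    ``GPy.kern.poly`` constructor from GPy/kern/constructors.py)::
+
+        import GPy
+        # a degree-2 polynomial kernel on a two-dimensional input space
+        k = GPy.kern.poly(input_dim=2, variance=1., degree=2)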
diff --git a/GPy/likelihoods/noise_models/gp_transformations.py b/GPy/likelihoods/noise_models/gp_transformations.py
index ccf965d9..32a591e8 100644
--- a/GPy/likelihoods/noise_models/gp_transformations.py
+++ b/GPy/likelihoods/noise_models/gp_transformations.py
@@ -10,19 +10,23 @@ from GPy.util.univariate_Gaussian import std_norm_pdf,std_norm_cdf,inv_std_norm_

 class GPTransformation(object):
     """
+
     Link function class for doing non-Gaussian likelihoods approximation

     :param Y: observed output (Nx1 numpy.darray)
-    ..Note:: Y values allowed depend on the likelihood_function used
+
+    .. note:: Y values allowed depend on the likelihood_function used
+
     """
     def __init__(self):
         pass

 class Identity(GPTransformation):
     """
-    $$
-    g(f) = f
-    $$
+    .. math::
+
+        g(f) = f
+
     """
     #def transf(self,mu):
     #    return mu
@@ -39,9 +43,10 @@ class Identity(GPTransformation):

 class Probit(GPTransformation):
     """
-    $$
-    g(f) = \\Phi^{-1} (mu)
-    $$
+    .. math::
+
+        g(f) = \\Phi^{-1}(\\mu)
+
     """
     #def transf(self,mu):
     #    return inv_std_norm_cdf(mu)
@@ -57,9 +62,9 @@ class Probit(GPTransformation):

 class Log(GPTransformation):
     """
-    $$
-    g(f) = \log(\mu)
-    $$
+    .. math::
+        g(f) = \\log(\\mu)
+
     """
     #def transf(self,mu):
     #    return np.log(mu)
@@ -75,9 +80,9 @@ class Log(GPTransformation):

 class Log_ex_1(GPTransformation):
     """
-    $$
-    g(f) = \log(\exp(\mu) - 1)
-    $$
+    .. math::
+        g(f) = \\log(\\exp(\\mu) - 1)
+
     """
     #def transf(self,mu):
     #    """
@@ -110,9 +115,11 @@ class Reciprocal(GPTransformation):

 class Heaviside(GPTransformation):
     """
-    $$
-    g(f) = I_{x \in A}
-    $$
+
+    .. math::
+
+        g(f) = I_{x \\in A}
+
     """
     def transf(self,f):
         #transformation goes here
diff --git a/GPy/likelihoods/noise_models/noise_distributions.py b/GPy/likelihoods/noise_models/noise_distributions.py
index 4fd9c97f..331c6990 100644
--- a/GPy/likelihoods/noise_models/noise_distributions.py
+++ b/GPy/likelihoods/noise_models/noise_distributions.py
@@ -16,7 +16,8 @@ class NoiseDistribution(object):
     Likelihood class for doing Expectation propagation

     :param Y: observed output (Nx1 numpy.darray)
-    ..Note:: Y values allowed depend on the LikelihoodFunction used
+
+    .. note:: Y values allowed depend on the LikelihoodFunction used
     """
     def __init__(self,gp_link,analytical_mean=False,analytical_variance=False):
         #assert isinstance(gp_link,gp_transformations.GPTransformation), "gp_link is not a valid GPTransformation."#FIXME
@@ -51,6 +52,7 @@ class NoiseDistribution(object):
         In case it is needed, this function assess the output values or makes any pertinent transformation on them.

         :param Y: observed output (Nx1 numpy.darray)
+
         """
         return Y

@@ -62,18 +64,21 @@ class NoiseDistribution(object):
         :param obs: observed output
         :param mu: cavity distribution mean
         :param sigma: cavity distribution standard deviation
+
         """
         return stats.norm.pdf(gp,loc=mu,scale=sigma) * self._mass(gp,obs)

     def _nlog_product_scaled(self,gp,obs,mu,sigma):
         """
         Negative log-product between the cavity distribution and a likelihood factor.
-        ..Note:: The constant term in the Gaussian distribution is ignored.
+
+        .. note:: The constant term in the Gaussian distribution is ignored.
:param gp: latent variable :param obs: observed output :param mu: cavity distribution mean :param sigma: cavity distribution standard deviation + """ return .5*((gp-mu)/sigma)**2 + self._nlog_mass(gp,obs) @@ -85,6 +90,7 @@ class NoiseDistribution(object): :param obs: observed output :param mu: cavity distribution mean :param sigma: cavity distribution standard deviation + """ return (gp - mu)/sigma**2 + self._dnlog_mass_dgp(gp,obs) @@ -96,6 +102,7 @@ class NoiseDistribution(object): :param obs: observed output :param mu: cavity distribution mean :param sigma: cavity distribution standard deviation + """ return 1./sigma**2 + self._d2nlog_mass_dgp2(gp,obs) @@ -106,6 +113,7 @@ class NoiseDistribution(object): :param obs: observed output :param mu: cavity distribution mean :param sigma: cavity distribution standard deviation + """ return sp.optimize.fmin_ncg(self._nlog_product_scaled,x0=mu,fprime=self._dnlog_product_dgp,fhess=self._d2nlog_product_dgp2,args=(obs,mu,sigma),disp=False) @@ -122,6 +130,7 @@ class NoiseDistribution(object): :param obs: observed output :param tau: cavity distribution 1st natural parameter (precision) :param v: cavity distribution 2nd natural paramenter (mu*precision) + """ mu = v/tau mu_hat = self._product_mode(obs,mu,np.sqrt(1./tau)) @@ -137,7 +146,8 @@ class NoiseDistribution(object): :param mu: cavity distribution mean :param sigma: cavity distribution standard deviation - ..Note:: This function helps computing E(Y_star) = E(E(Y_star|f_star)) + .. note:: This function helps computing E(Y_star) = E(E(Y_star|f_star)) + """ return .5*((gp - mu)/sigma)**2 - np.log(self._mean(gp)) @@ -148,6 +158,7 @@ class NoiseDistribution(object): :param gp: latent variable :param mu: cavity distribution mean :param sigma: cavity distribution standard deviation + """ return (gp - mu)/sigma**2 - self._dmean_dgp(gp)/self._mean(gp) @@ -158,6 +169,7 @@ class NoiseDistribution(object): :param gp: latent variable :param mu: cavity distribution mean :param sigma: cavity distribution standard deviation + """ return 1./sigma**2 - self._d2mean_dgp2(gp)/self._mean(gp) + (self._dmean_dgp(gp)/self._mean(gp))**2 @@ -169,7 +181,8 @@ class NoiseDistribution(object): :param mu: cavity distribution mean :param sigma: cavity distribution standard deviation - ..Note:: This function helps computing E(V(Y_star|f_star)) + .. note:: This function helps computing E(V(Y_star|f_star)) + """ return .5*((gp - mu)/sigma)**2 - np.log(self._variance(gp)) @@ -180,6 +193,7 @@ class NoiseDistribution(object): :param gp: latent variable :param mu: cavity distribution mean :param sigma: cavity distribution standard deviation + """ return (gp - mu)/sigma**2 - self._dvariance_dgp(gp)/self._variance(gp) @@ -190,6 +204,7 @@ class NoiseDistribution(object): :param gp: latent variable :param mu: cavity distribution mean :param sigma: cavity distribution standard deviation + """ return 1./sigma**2 - self._d2variance_dgp2(gp)/self._variance(gp) + (self._dvariance_dgp(gp)/self._variance(gp))**2 @@ -201,7 +216,8 @@ class NoiseDistribution(object): :param mu: cavity distribution mean :param sigma: cavity distribution standard deviation - ..Note:: This function helps computing E( E(Y_star|f_star)**2 ) + .. 
note:: This function helps computing E( E(Y_star|f_star)**2 )
+
        """
        return .5*((gp - mu)/sigma)**2 - 2*np.log(self._mean(gp))

@@ -212,6 +228,7 @@ class NoiseDistribution(object):
        :param gp: latent variable
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
+
        """
        return (gp - mu)/sigma**2 - 2*self._dmean_dgp(gp)/self._mean(gp)

@@ -222,6 +239,7 @@ class NoiseDistribution(object):
        :param gp: latent variable
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
+
        """
        return 1./sigma**2 - 2*( self._d2mean_dgp2(gp)/self._mean(gp) - (self._dmean_dgp(gp)/self._mean(gp))**2 )

@@ -243,6 +261,7 @@ class NoiseDistribution(object):
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation

+
        """
        maximum = sp.optimize.fmin_ncg(self._nlog_conditional_mean_scaled,x0=self._mean(mu),fprime=self._dnlog_conditional_mean_dgp,fhess=self._d2nlog_conditional_mean_dgp2,args=(mu,sigma),disp=False)
        mean = np.exp(-self._nlog_conditional_mean_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_conditional_mean_dgp2(maximum,mu,sigma))*sigma)
@@ -266,6 +285,7 @@ class NoiseDistribution(object):
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation

+
        """
        maximum = sp.optimize.fmin_ncg(self._nlog_exp_conditional_mean_sq_scaled,x0=self._mean(mu),fprime=self._dnlog_exp_conditional_mean_sq_dgp,fhess=self._d2nlog_exp_conditional_mean_sq_dgp2,args=(mu,sigma),disp=False)
        mean_squared = np.exp(-self._nlog_exp_conditional_mean_sq_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_exp_conditional_mean_sq_dgp2(maximum,mu,sigma))*sigma)
@@ -278,6 +298,7 @@ class NoiseDistribution(object):
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        :predictive_mean: output's predictive mean, if None _predictive_mean function will be called.
+
        """
        # E( V(Y_star|f_star) )
        maximum = sp.optimize.fmin_ncg(self._nlog_exp_conditional_variance_scaled,x0=self._variance(mu),fprime=self._dnlog_exp_conditional_variance_dgp,fhess=self._d2nlog_exp_conditional_variance_dgp2,args=(mu,sigma),disp=False)
@@ -310,6 +331,7 @@ class NoiseDistribution(object):
        :param mu: cavity distribution mean
        :param sigma: cavity distribution standard deviation
        :predictive_mean: output's predictive mean, if None _predictive_mean function will be called.
+
        """
        qf = stats.norm.ppf(p,mu,sigma)
        return self.gp_link.transf(qf)
@@ -321,6 +343,7 @@ class NoiseDistribution(object):
        :param x: tuple (latent variable,output)
        :param mu: latent variable's predictive mean
        :param sigma: latent variable's predictive standard deviation
+
        """
        return self._nlog_product_scaled(x[0],x[1],mu,sigma)

@@ -331,7 +354,9 @@ class NoiseDistribution(object):
        :param x: tuple (latent variable,output)
        :param mu: latent variable's predictive mean
        :param sigma: latent variable's predictive standard deviation
-        ..Note: Only avilable when the output is continuous
+
+        .. note:: Only available when the output is continuous
+
        """
        assert not self.discrete, "Gradient not available for discrete outputs."
        return np.array((self._dnlog_product_dgp(gp=x[0],obs=x[1],mu=mu,sigma=sigma),self._dnlog_mass_dobs(obs=x[1],gp=x[0])))

@@ -343,7 +368,9 @@ class NoiseDistribution(object):
        :param x: tuple (latent variable,output)
        :param mu: latent variable's predictive mean
        :param sigma: latent variable's predictive standard deviation
-        ..Note: Only avilable when the output is continuous
+
+        .. note:: Only available when the output is continuous
+
        """
        assert not self.discrete, "Hessian not available for discrete outputs."
        cross_derivative = self._d2nlog_mass_dcross(gp=x[0],obs=x[1])
@@ -356,14 +383,17 @@ class NoiseDistribution(object):
        :param x: tuple (latent variable,output)
        :param mu: latent variable's predictive mean
        :param sigma: latent variable's predictive standard deviation
+
        """
        return sp.optimize.fmin_ncg(self._nlog_joint_predictive_scaled,x0=(mu,self.gp_link.transf(mu)),fprime=self._gradient_nlog_joint_predictive,fhess=self._hessian_nlog_joint_predictive,args=(mu,sigma),disp=False)

    def predictive_values(self,mu,var):
        """
-        Compute mean, variance and conficence interval (percentiles 5 and 95) of the prediction
+        Compute mean, variance and confidence interval (percentiles 5 and 95) of the prediction.
+
        :param mu: mean of the latent variable
        :param var: variance of the latent variable
+
        """
        if isinstance(mu,float) or isinstance(mu,int):
            mu = [mu]
diff --git a/GPy/likelihoods/noise_models/poisson_noise.py b/GPy/likelihoods/noise_models/poisson_noise.py
index e4ce90d3..f143c513 100644
--- a/GPy/likelihoods/noise_models/poisson_noise.py
+++ b/GPy/likelihoods/noise_models/poisson_noise.py
@@ -13,10 +13,10 @@ class Poisson(NoiseDistribution):
    """
    Poisson likelihood
    Y is expected to take values in {0,1,2,...}
-    -----
-    $$
-    L(x) = \exp(\lambda) * \lambda**Y_i / Y_i!
-    $$
+
+    .. math::
+        L(x) = \\exp(-\\lambda) \\frac{\\lambda^{Y_i}}{Y_i!}
+
    """
    def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
        #self.discrete = True
diff --git a/GPy/mappings/mlp.py b/GPy/mappings/mlp.py
index 40ff3782..46dbc2a9 100644
--- a/GPy/mappings/mlp.py
+++ b/GPy/mappings/mlp.py
@@ -10,11 +10,13 @@ class MLP(Mapping):

    .. math::

-       f(\mathbf{x}*) = \mathbf{W}^0\boldsymbol{\phi}(\mathbf{W}^1\mathbf{x}+\mathb{b}^1)^* + \mathbf{b}^0
+       f(\\mathbf{x}*) = \\mathbf{W}^0\\boldsymbol{\\phi}(\\mathbf{W}^1\\mathbf{x}+\\mathbf{b}^1)^* + \\mathbf{b}^0

    where
-    ..math::
-        \phi(\cdot) = \text{tanh}(\cdot)
+
+    .. math::
+
+        \\phi(\\cdot) = \\text{tanh}(\\cdot)

    :param X: input observations
    :type X: ndarray
    :param output_dim: dimension of output.
    :type output_dim: int
    :param hidden_dim: dimension of hidden layer. If it is an int, there is one hidden layer of the given dimension. If it is a list of ints there are as manny hidden layers as the length of the list, each with the given number of hidden nodes in it.
    :type hidden_dim: int or list of ints.
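+
+    A minimal construction sketch (hypothetical values; assuming this class's
+    constructor, with the import path mirroring the file location)::
+
+        from GPy.mappings.mlp import MLP
+        # one hidden layer of 5 nodes followed by one of 3 nodes
+        mapping = MLP(input_dim=2, output_dim=1, hidden_dim=[5, 3])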
+ """ def __init__(self, input_dim=1, output_dim=1, hidden_dim=3): diff --git a/GPy/models/mrd.py b/GPy/models/mrd.py index 99e50a19..be191e9b 100644 --- a/GPy/models/mrd.py +++ b/GPy/models/mrd.py @@ -39,6 +39,7 @@ class MRD(Model): :param num_inducing: number of inducing inputs to use :param kernels: list of kernels or kernel shared for all BGPLVMS :type kernels: [GPy.kern.kern] | GPy.kern.kern | None (default) + """ def __init__(self, likelihood_or_Y_list, input_dim, num_inducing=10, names=None, kernels=None, initx='PCA', @@ -338,8 +339,11 @@ class MRD(Model): def plot_scales(self, fignum=None, ax=None, titles=None, sharex=False, sharey=True, *args, **kwargs): """ - :param:`titles` : - titles for axes of datasets + + TODO: Explain other parameters + + :param titles: titles for axes of datasets + """ if titles is None: titles = [r'${}$'.format(name) for name in self.names] diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py index 1effa9ce..3415c198 100644 --- a/GPy/util/linalg.py +++ b/GPy/util/linalg.py @@ -27,48 +27,56 @@ except: _blas_available = False def dtrtrs(A, B, lower=0, trans=0, unitdiag=0): - """Wrapper for lapack dtrtrs function + """ + Wrapper for lapack dtrtrs function :param A: Matrix A :param B: Matrix B :param lower: is matrix lower (true) or upper (false) :returns: + """ return lapack.dtrtrs(A, B, lower=lower, trans=trans, unitdiag=unitdiag) def dpotrs(A, B, lower=0): - """Wrapper for lapack dpotrs function + """ + Wrapper for lapack dpotrs function :param A: Matrix A :param B: Matrix B :param lower: is matrix lower (true) or upper (false) :returns: + """ return lapack.dpotrs(A, B, lower=lower) def dpotri(A, lower=0): - """Wrapper for lapack dpotri function + """ + Wrapper for lapack dpotri function :param A: Matrix A :param lower: is matrix lower (true) or upper (false) :returns: A inverse + """ return lapack.dpotri(A, lower=lower) def trace_dot(a, b): """ - efficiently compute the trace of the matrix product of a and b + Efficiently compute the trace of the matrix product of a and b """ return np.sum(a * b) def mdot(*args): - """Multiply all the arguments using matrix product rules. + """ + Multiply all the arguments using matrix product rules. The output is equivalent to multiplying the arguments one by one from left to right using dot(). Precedence can be controlled by creating tuples of arguments, for instance mdot(a,((b,c),d)) multiplies a (a*((b*c)*d)). Note that this means the output of dot(a,b) and mdot(a,b) will differ if a or b is a pure tuple of numbers. + """ if len(args) == 1: return args[0] @@ -119,10 +127,12 @@ def jitchol_old(A, maxtries=5): :rval L: the Cholesky decomposition of A - .. Note: + .. 
note::
+
        Adds jitter to K, to enforce positive-definiteness
        if stuff breaks, please check:
        np.allclose(sp.linalg.cholesky(XXT, lower = True), np.triu(sp.linalg.cho_factor(XXT)[0]).T)
+
    """
    try:
        return linalg.cholesky(A, lower=True)
@@ -142,6 +152,7 @@ def jitchol_old(A, maxtries=5):

 def pdinv(A, *args):
     """
+
     :param A: A DxD pd numpy array

     :rval Ai: the inverse of A
@@ -152,6 +163,7 @@ def pdinv(A, *args):
     :rtype Li: np.ndarray
     :rval logdet: the log of the determinant of A
     :rtype logdet: float64
+
     """
     L = jitchol(A, *args)
     logdet = 2.*np.sum(np.log(np.diag(L)))
@@ -177,14 +189,13 @@ def chol_inv(L):

 def multiple_pdinv(A):
     """
-    Arguments
-    ---------
     :param A: A DxDxN numpy array (each A[:,:,i] is pd)

-    Returns
-    -------
-    invs : the inverses of A
-    hld: 0.5* the log of the determinants of A
+    :rval invs: the inverses of A
+    :rtype invs: np.ndarray
+    :rval hld: 0.5* the log of the determinants of A
+    :rtype hld: np.array
+
     """
     N = A.shape[-1]
     chols = [jitchol(A[:, :, i]) for i in range(N)]
@@ -198,15 +209,13 @@ def PCA(Y, input_dim):
     """
     Principal component analysis: maximum likelihood solution by SVD

-    Arguments
-    ---------
     :param Y: NxD np.array of data
     :param input_dim: int, dimension of projection

-    Returns
-    -------
+
     :rval X: - Nxinput_dim np.array of dimensionality reduced data
-    W - input_dimxD mapping from X to Y
+    :rval W: - input_dimxD mapping from X to Y
+
     """
     if not np.allclose(Y.mean(axis=0), 0.0):
         print "Y is not zero mean, centering it locally (GPy.util.linalg.PCA)"
@@ -273,11 +282,10 @@ def DSYR_blas(A, x, alpha=1.):
     Performs a symmetric rank-1 update operation:
     A <- A + alpha * np.dot(x,x.T)

-    Arguments
-    ---------
     :param A: Symmetric NxN np.array
     :param x: Nx1 np.array
     :param alpha: scalar
+
     """
     N = c_int(A.shape[0])
     LDA = c_int(A.shape[0])
@@ -295,11 +303,10 @@ def DSYR_numpy(A, x, alpha=1.):
     Performs a symmetric rank-1 update operation:
     A <- A + alpha * np.dot(x,x.T)

-    Arguments
-    ---------
     :param A: Symmetric NxN np.array
     :param x: Nx1 np.array
     :param alpha: scalar
+
     """
     A += alpha * np.dot(x[:, None], x[None, :])

@@ -363,8 +370,9 @@ def cholupdate(L, x):
     """
     update the LOWER cholesky factor of a pd matrix IN PLACE

-    if L is the lower chol. of K, then this function computes L_
-    where L_ is the lower chol of K + x*x^T
+    if L is the lower chol. of K, then this function computes L\_
+    where L\_ is the lower chol of K + x*x^T
+
     """
     support_code = """
     #include
diff --git a/GPy/util/mocap.py b/GPy/util/mocap.py
index 3d10fc0d..1446512d 100644
--- a/GPy/util/mocap.py
+++ b/GPy/util/mocap.py
@@ -92,13 +92,15 @@ class tree:

     def swap_vertices(self, i, j):
-        """Swap two vertices in the tree structure array.
+        """
+        Swap two vertices in the tree structure array.
         swap_vertex swaps the location of two vertices in a tree structure array.
-        ARG tree : the tree for which two vertices are to be swapped.
-        ARG i : the index of the first vertex to be swapped.
-        ARG j : the index of the second vertex to be swapped.
-        RETURN tree : the tree structure with the two vertex locations
-        swapped.
+
+        :param tree: the tree for which two vertices are to be swapped.
+        :param i: the index of the first vertex to be swapped.
+        :param j: the index of the second vertex to be swapped.
+        :rval tree: the tree structure with the two vertex locations swapped.
+
         """
         store_vertex_i = self.vertices[i]
         store_vertex_j = self.vertices[j]
@@ -117,12 +119,16 @@ class tree:

 def rotation_matrix(xangle, yangle, zangle, order='zxy', degrees=False):
-    """Compute the rotation matrix for an angle in each direction.
+ """ + Compute the rotation matrix for an angle in each direction. This is a helper function for computing the rotation matrix for a given set of angles in a given order. - ARG xangle : rotation for x-axis. - ARG yangle : rotation for y-axis. - ARG zangle : rotation for z-axis. - ARG order : the order for the rotations.""" + + :param xangle: rotation for x-axis. + :param yangle: rotation for y-axis. + :param zangle: rotation for z-axis. + :param order: the order for the rotations. + + """ if degrees: xangle = math.radians(xangle) yangle = math.radians(yangle) @@ -301,10 +307,14 @@ class acclaim_skeleton(skeleton): def load_skel(self, file_name): - """Loads an ASF file into a skeleton structure. + """ + Loads an ASF file into a skeleton structure. loads skeleton structure from an acclaim skeleton file. - ARG file_name : the file name to load in. - RETURN skel : the skeleton for the file.""" + + :param file_name: the file name to load in. + :rval skel: the skeleton for the file. - TODO isn't returning this? + + """ fid = open(file_name, 'r') self.read_skel(fid) diff --git a/GPy/util/plot_latent.py b/GPy/util/plot_latent.py index 81c3d6fc..62442650 100644 --- a/GPy/util/plot_latent.py +++ b/GPy/util/plot_latent.py @@ -15,7 +15,7 @@ def most_significant_input_dimensions(model, which_indices): try: input_1, input_2 = np.argsort(model.input_sensitivity())[::-1][:2] except: - raise ValueError, "cannot Atomatically determine which dimensions to plot, please pass 'which_indices'" + raise ValueError, "cannot automatically determine which dimensions to plot, please pass 'which_indices'" else: input_1, input_2 = which_indices return input_1, input_2 diff --git a/doc/conf.py b/doc/conf.py index 42def116..7b71a897 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -106,7 +106,7 @@ class Mock(object): print "Mocking" MOCK_MODULES = ['sympy', 'sympy.utilities', 'sympy.utilities.codegen', 'sympy.core.cache', - 'sympy.core', 'sympy.parsing', 'sympy.parsing.sympy_parser' + 'sympy.core', 'sympy.parsing', 'sympy.parsing.sympy_parser', 'Tango', 'numdifftools' ] for mod_name in MOCK_MODULES: sys.modules[mod_name] = Mock() diff --git a/doc/tuto_interacting_with_models.rst b/doc/tuto_interacting_with_models.rst index 4a466bae..342fb24f 100644 --- a/doc/tuto_interacting_with_models.rst +++ b/doc/tuto_interacting_with_models.rst @@ -107,7 +107,7 @@ inputs: :: m['iip'] = np.arange(-5,0) Getting the model's likelihood and gradients -=========================================== +============================================= Appart form the printing the model, the marginal log-likelihood can be obtained by using the function ``log_likelihood()``. Also, the log-likelihood gradients