diff --git a/GPy/core/gp.py b/GPy/core/gp.py
index 2d826ac2..a3ef6c80 100644
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@@ -15,7 +15,7 @@ class GP(GPBase):
 
     :param X: input observations
     :param kernel: a GPy kernel, defaults to rbf+white
-    :parm likelihood: a GPy likelihood
+    :param likelihood: a GPy likelihood
     :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
     :type normalize_X: False|True
     :rtype: model object
@@ -132,17 +132,16 @@ class GP(GPBase):
     def predict(self, Xnew, which_parts='all', full_cov=False, likelihood_args=dict()):
         """
         Predict the function(s) at the new point(s) Xnew.
-        Arguments
-        ---------
+
         :param Xnew: The points at which to make a prediction
         :type Xnew: np.ndarray, Nnew x self.input_dim
         :param which_parts: specifies which outputs kernel(s) to use in prediction
         :type which_parts: ('all', list of bools)
         :param full_cov: whether to return the full covariance matrix, or just the diagonal
         :type full_cov: bool
-        :rtype: posterior mean, a Numpy array, Nnew x self.input_dim
-        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
-        :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim
+        :returns: mean: posterior mean, a Numpy array, Nnew x self.input_dim
+        :returns: var: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
+        :returns: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim
 
         If full_cov and self.input_dim > 1, the return shape of var is Nnew x Nnew x self.input_dim.
         If self.input_dim == 1, the return shape is Nnew x Nnew.
@@ -160,8 +159,7 @@ class GP(GPBase):
     def predict_single_output(self, Xnew, output=0, which_parts='all', full_cov=False):
         """
         For a specific output, predict the function at the new point(s) Xnew.
-        Arguments
-        ---------
+
         :param Xnew: The points at which to make a prediction
         :type Xnew: np.ndarray, Nnew x self.input_dim
         :param output: output to predict
@@ -170,9 +168,9 @@ class GP(GPBase):
         :type which_parts: ('all', list of bools)
         :param full_cov: whether to return the full covariance matrix, or just the diagonal
         :type full_cov: bool
-        :rtype: posterior mean, a Numpy array, Nnew x self.input_dim
-        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
-        :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim
+        :returns: posterior mean, a Numpy array, Nnew x self.input_dim
+        :returns: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
+        :returns: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim
 
         .. Note:: For multiple output models only
         """
diff --git a/GPy/core/model.py b/GPy/core/model.py
index cb13378c..7aff8f4d 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -47,6 +47,7 @@ class Model(Parameterized):
 
         :param state: the state of the model.
         :type state: list as returned from getstate.
+
         """
         self.preferred_optimizer = state.pop()
         self.sampling_runs = state.pop()
@@ -543,10 +544,11 @@ class Model(Parameterized):
         """
         EM - like algorithm for Expectation Propagation and Laplace approximation
 
-        :stop_crit: convergence criterion
+        :param stop_crit: convergence criterion
         :type stop_crit: float
 
-        ..Note: kwargs are passed to update_likelihood and optimize functions. """
+        .. Note:: kwargs are passed to update_likelihood and optimize functions.
+        """
         assert isinstance(self.likelihood, likelihoods.EP) or isinstance(self.likelihood, likelihoods.EP_Mixed_Noise), "pseudo_EM is only available for EP likelihoods"
         ll_change = stop_crit + 1.
         iteration = 0
diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index f6a7b885..88bd36e6 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -393,8 +393,7 @@ class SparseGP(GPBase):
     def predict_single_output(self, Xnew, output=0, which_parts='all', full_cov=False):
         """
         For a specific output, predict the function at the new point(s) Xnew.
-        Arguments
-        ---------
+
         :param Xnew: The points at which to make a prediction
         :type Xnew: np.ndarray, Nnew x self.input_dim
         :param output: output to predict
diff --git a/GPy/inference/sgd.py b/GPy/inference/sgd.py
index e443f45a..5cd144e8 100644
--- a/GPy/inference/sgd.py
+++ b/GPy/inference/sgd.py
@@ -10,11 +10,10 @@ class opt_SGD(Optimizer):
     """
     Optimize using stochastic gradient descent.
 
-    *** Parameters ***
-    Model: reference to the Model object
-    iterations: number of iterations
-    learning_rate: learning rate
-    momentum: momentum
+    :param Model: reference to the Model object
+    :param iterations: number of iterations
+    :param learning_rate: learning rate
+    :param momentum: momentum
 
     """
diff --git a/GPy/kern/constructors.py b/GPy/kern/constructors.py
index 28066413..ddaf8d54 100644
--- a/GPy/kern/constructors.py
+++ b/GPy/kern/constructors.py
@@ -80,29 +80,30 @@ def gibbs(input_dim,variance=1., mapping=None):
     .. math::
 
-       r = sqrt((x_i - x_j)'*(x_i - x_j))
+       r = \\sqrt{((x_i - x_j)'*(x_i - x_j))}
 
-       k(x_i, x_j) = \sigma^2*Z*exp(-r^2/(l(x)*l(x) + l(x')*l(x')))
+       k(x_i, x_j) = \\sigma^2*Z*exp(-r^2/(l(x)*l(x) + l(x')*l(x')))
 
-       Z = \sqrt{2*l(x)*l(x')/(l(x)*l(x) + l(x')*l(x')}
+       Z = \\sqrt{2*l(x)*l(x')/(l(x)*l(x) + l(x')*l(x'))}
 
-    where :math:`l(x)` is a function giving the length scale as a function of space.
-    This is the non stationary kernel proposed by Mark Gibbs in his 1997
-    thesis. It is similar to an RBF but has a length scale that varies
-    with input location. This leads to an additional term in front of
-    the kernel.
+    Here :math:`l(x)` is a function giving the length scale as a function of space.
+
-    The parameters are :math:`\sigma^2`, the process variance, and the parameters of l(x) which is a function that can be specified by the user, by default an multi-layer peceptron is used is used.
+    This is the non stationary kernel proposed by Mark Gibbs in his 1997
+    thesis. It is similar to an RBF but has a length scale that varies
+    with input location. This leads to an additional term in front of
+    the kernel.
 
-    :param input_dim: the number of input dimensions
-    :type input_dim: int
-    :param variance: the variance :math:`\sigma^2`
-    :type variance: float
-    :param mapping: the mapping that gives the lengthscale across the input space.
-    :type mapping: GPy.core.Mapping
-    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter \sigma^2_w), otherwise there is one weight variance parameter per dimension.
-    :type ARD: Boolean
-    :rtype: Kernpart object
+    The parameters are :math:`\\sigma^2`, the process variance, and the parameters of l(x), which is a function that can be specified by the user; by default a multi-layer perceptron is used.
+
+    :param input_dim: the number of input dimensions
+    :type input_dim: int
+    :param variance: the variance :math:`\\sigma^2`
+    :type variance: float
+    :param mapping: the mapping that gives the lengthscale across the input space.
+    :type mapping: GPy.core.Mapping
+    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (i.e. one weight variance parameter :math:`\\sigma^2_w`), otherwise there is one weight variance parameter per dimension.
+    :type ARD: Boolean
+    :rtype: Kernpart object
     """
     part = parts.gibbs.Gibbs(input_dim,variance,mapping)
diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py
index 6a72ac8d..9e930417 100644
--- a/GPy/kern/kern.py
+++ b/GPy/kern/kern.py
@@ -222,7 +222,8 @@ class kern(Parameterized):
     def prod(self, other, tensor=False):
         """
-        multiply two kernels (either on the same space, or on the tensor product of the input space).
+        Multiply two kernels (either on the same space, or on the tensor product of the input space).
+
-        :param other: the other kernel to be added
+        :param other: the other kernel to be multiplied
         :type other: GPy.kern
         :param tensor: whether or not to use the tensor space (default is false).
diff --git a/GPy/likelihoods/ep.py b/GPy/likelihoods/ep.py
index 67bb79fc..8fdf423f 100644
--- a/GPy/likelihoods/ep.py
+++ b/GPy/likelihoods/ep.py
@@ -95,6 +95,7 @@ class EP(likelihood):
         :type epsilon: float
         :param power_ep: Power EP parameters
         :type power_ep: list of floats
+
         """
         self.epsilon = epsilon
         self.eta, self.delta = power_ep
@@ -165,6 +166,7 @@ class EP(likelihood):
         :type epsilon: float
         :param power_ep: Power EP parameters
         :type power_ep: list of floats
+
         """
         self.epsilon = epsilon
         self.eta, self.delta = power_ep
diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py
index cda62bfc..ca187305 100644
--- a/GPy/likelihoods/likelihood.py
+++ b/GPy/likelihoods/likelihood.py
@@ -10,14 +10,16 @@ class likelihood(Parameterized):
     (Gaussian) inherits directly from this, as does the EP algorithm
 
     Some things must be defined for this to work properly:
-    self.Y : the effective Gaussian target of the GP
-    self.N, self.D : Y.shape
-    self.covariance_matrix : the effective (noise) covariance of the GP targets
-    self.Z : a factor which gets added to the likelihood (0 for a Gaussian, Z_EP for EP)
-    self.is_heteroscedastic : enables significant computational savings in GP
-    self.precision : a scalar or vector representation of the effective target precision
-    self.YYT : (optional) = np.dot(self.Y, self.Y.T) enables computational savings for D>N
-    self.V : self.precision * self.Y
+
+    - self.Y : the effective Gaussian target of the GP
+    - self.N, self.D : Y.shape
+    - self.covariance_matrix : the effective (noise) covariance of the GP targets
+    - self.Z : a factor which gets added to the likelihood (0 for a Gaussian, Z_EP for EP)
+    - self.is_heteroscedastic : enables significant computational savings in GP
+    - self.precision : a scalar or vector representation of the effective target precision
+    - self.YYT : (optional) = np.dot(self.Y, self.Y.T) enables computational savings for D>N
+    - self.V : self.precision * self.Y
+
     """
     def __init__(self):
         Parameterized.__init__(self)
diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py
index e094d915..d4d29711 100644
--- a/GPy/models/bayesian_gplvm.py
+++ b/GPy/models/bayesian_gplvm.py
@@ -245,12 +245,13 @@ class BayesianGPLVM(SparseGP, GPLVM):
         """
         Plot latent space X in 1D:
 
-            -if fig is given, create input_dim subplots in fig and plot in these
-            -if ax is given plot input_dim 1D latent space plots of X into each `axis`
-            -if neither fig nor ax is given create a figure with fignum and plot in there
+            - if fig is given, create input_dim subplots in fig and plot in these
+            - if ax is given plot input_dim 1D latent space plots of X into each `axis`
+            - if neither fig nor ax is given create a figure with fignum and plot in there
 
         colors: colors of different latent space dimensions input_dim
 
+
         """
         import pylab
         if ax is None:
diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py
index 8afd1470..1164d7e6 100644
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@@ -524,11 +524,14 @@ def simulation_BGPLVM():
         'info': "Simulated test dataset generated in MATLAB to compare BGPLVM between python and MATLAB"}
 
 def toy_rbf_1d(seed=default_seed, num_samples=500):
-    """Samples values of a function from an RBF covariance with very small noise for inputs uniformly distributed between -1 and 1.
+    """
+    Samples values of a function from an RBF covariance with very small noise for inputs uniformly distributed between -1 and 1.
+
     :param seed: seed to use for random sampling.
     :type seed: int
     :param num_samples: number of samples to sample in the function (default 500).
     :type num_samples: int
+
     """
     np.random.seed(seed=seed)
     num_in = 1
@@ -631,11 +634,15 @@ def olympic_marathon_men(data_set='olympic_marathon_men'):
 
 def crescent_data(num_data=200, seed=default_seed):
-    """Data set formed from a mixture of four Gaussians. In each class two of the Gaussians are elongated at right angles to each other and offset to form an approximation to the crescent data that is popular in semi-supervised learning as a toy problem.
+    """
+    Data set formed from a mixture of four Gaussians. In each class two of the Gaussians are elongated at right angles to each other and offset to form an approximation to the crescent data that is popular in semi-supervised learning as a toy problem.
+
-    :param num_data_part: number of data to be sampled (default is 200).
+    :param num_data: number of data to be sampled (default is 200).
     :type num_data: int
     :param seed: random seed to be used for data generation.
-    :type seed: int"""
+    :type seed: int
+
+    """
     np.random.seed(seed=seed)
     sqrt2 = np.sqrt(2)
     # Rotation matrix
diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py
index 3415c198..4e7f7fff 100644
--- a/GPy/util/linalg.py
+++ b/GPy/util/linalg.py
@@ -123,7 +123,7 @@ def jitchol(A, maxtries=5):
 
 def jitchol_old(A, maxtries=5):
     """
-    :param A : An almost pd square matrix
+    :param A: An almost pd square matrix
 
     :rval L: the Cholesky decomposition of A
diff --git a/GPy/util/misc.py b/GPy/util/misc.py
index 29d69848..72edf99f 100644
--- a/GPy/util/misc.py
+++ b/GPy/util/misc.py
@@ -17,12 +17,9 @@ def linear_grid(D, n = 100, min_max = (-100, 100)):
     """
     Creates a D-dimensional grid of n linearly spaced points
 
-    Parameters:
-
-    D: dimension of the grid
-    n: number of points
-    min_max: (min, max) list
-
+    :param D: dimension of the grid
+    :param n: number of points
+    :param min_max: (min, max) list
 
     """
@@ -39,6 +36,7 @@ def kmm_init(X, m = 10):
 
     :param X: data
     :param m: number of inducing points
+
     """
 
     # compute the distances
diff --git a/GPy/util/mocap.py b/GPy/util/mocap.py
index 1446512d..78f00955 100644
--- a/GPy/util/mocap.py
+++ b/GPy/util/mocap.py
@@ -120,13 +120,14 @@ class tree:
 def rotation_matrix(xangle, yangle, zangle, order='zxy', degrees=False):
     """
+    Compute the rotation matrix for an angle in each direction.
     This is a helper function for computing the rotation matrix for a given set of angles in a given order.
 
-    :param xangle: rotation for x-axis.
-    :param yangle: rotation for y-axis.
-    :param zangle: rotation for z-axis.
-    :param order: the order for the rotations.
+    :param xangle: rotation for x-axis.
+    :param yangle: rotation for y-axis.
+    :param zangle: rotation for z-axis.
+    :param order: the order for the rotations.
 
     """
     if degrees:
@@ -309,10 +310,8 @@ class acclaim_skeleton(skeleton):
         """
         Loads an ASF file into a skeleton structure.
 
-        loads skeleton structure from an acclaim skeleton file.
-        :param file_name: the file name to load in.
-        :rval skel: the skeleton for the file.
-        TODO isn't returning this?
+        :param file_name: The file name to load in.
 
         """
diff --git a/GPy/util/visualize.py b/GPy/util/visualize.py
index 4c3dbe2b..7a519555 100644
--- a/GPy/util/visualize.py
+++ b/GPy/util/visualize.py
@@ -502,11 +502,14 @@ def data_play(Y, visualizer, frame_rate=30):
 
     This example loads in the CMU mocap database (http://mocap.cs.cmu.edu)
     subject number 35 motion number 01. It then plays it using the mocap_show
    visualize object.
 
-    data = GPy.util.datasets.cmu_mocap(subject='35', train_motions=['01'])
-    Y = data['Y']
-    Y[:, 0:3] = 0. # Make figure walk in place
-    visualize = GPy.util.visualize.skeleton_show(Y[0, :], data['skel'])
-    GPy.util.visualize.data_play(Y, visualize)
+    .. code-block:: python
+
+        data = GPy.util.datasets.cmu_mocap(subject='35', train_motions=['01'])
+        Y = data['Y']
+        Y[:, 0:3] = 0. # Make figure walk in place
+        visualize = GPy.util.visualize.skeleton_show(Y[0, :], data['skel'])
+        GPy.util.visualize.data_play(Y, visualize)
+
     """
diff --git a/GPy/util/warping_functions.py b/GPy/util/warping_functions.py
index f36805a9..e05f39af 100644
--- a/GPy/util/warping_functions.py
+++ b/GPy/util/warping_functions.py
@@ -53,9 +53,11 @@ class TanhWarpingFunction(WarpingFunction):
         self.num_parameters = 3 * self.n_terms
 
     def f(self,y,psi):
-        """transform y with f using parameter vector psi
+        """
+        Transform y with f using parameter vector psi
         psi = [[a,b,c]]
-        f = \sum_{terms} a * tanh(b*(y+c))
+        :math:`f = \\sum_{terms} a * tanh(b*(y+c))`
+
         """
 
         #1. check that number of params is consistent
@@ -77,8 +79,7 @@ class TanhWarpingFunction(WarpingFunction):
         """
         calculate the numerical inverse of f
 
-        == input ==
-        iterations: number of N.R. iterations
+        :param iterations: number of N.R. iterations
 
         """
@@ -165,9 +166,11 @@ class TanhWarpingFunction_d(WarpingFunction):
         self.num_parameters = 3 * self.n_terms + 1
 
     def f(self,y,psi):
-        """transform y with f using parameter vector psi
+        """
+        Transform y with f using parameter vector psi
         psi = [[a,b,c]]
-        f = \sum_{terms} a * tanh(b*(y+c))
+
+        :math:`f = \\sum_{terms} a * tanh(b*(y+c))`
         """
@@ -189,8 +192,7 @@ class TanhWarpingFunction_d(WarpingFunction):
         """
         calculate the numerical inverse of f
 
-        == input ==
-        iterations: number of N.R. iterations
+        :param max_iterations: maximum number of N.R. iterations
 
         """
@@ -214,12 +216,13 @@ class TanhWarpingFunction_d(WarpingFunction):
     def fgrad_y(self, y, psi, return_precalc = False):
         """
         gradient of f w.r.t to y ([N x 1])
-        returns: Nx1 vector of derivatives, unless return_precalc is true,
-        then it also returns the precomputed stuff
+
+        :returns: Nx1 vector of derivatives, unless return_precalc is true, then it also returns the precomputed stuff
+
         """
 
         mpsi = psi.copy()
         d = psi[-1]
         mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3)
@@ -242,7 +245,7 @@ class TanhWarpingFunction_d(WarpingFunction):
         """
         gradient of f w.r.t to y and psi
 
-        returns: NxIx4 tensor of partial derivatives
+        :returns: NxIx4 tensor of partial derivatives
 
         """