Merge branch 'devel' into params

This commit is contained in:
Max Zwiessele 2013-10-07 08:20:29 +01:00
commit 4f56506aa6
60 changed files with 1944 additions and 596 deletions

8
GPy/FAQ.txt Normal file
View file

@ -0,0 +1,8 @@
Frequently Asked Questions
--------------------------
Unit tests are run through Travis-Ci. They can be run locally through entering the GPy route diretory and writing
nosetests testing/
Documentation is handled by Sphinx. To build the documentation:

View file

@ -0,0 +1,10 @@
In this text document we will describe coding conventions to be used in GPy to keep things consistent.
All arrays containing data are two dimensional. The first dimension is the number of data, the second dimension is number of features. This keeps things consistent with the idea of a design matrix.
Input matrices are either X or t, output matrices are Y.
Input dimensionality is input_dim, output dimensionality is output_dim, number of data is num_data.
Data sets are preprocessed in the datasets.py file. This file also records where the data set was obtained from in the dictionary stored in the file. Long term we should move this dictionary to sqlite or similar.

View file

@ -11,25 +11,27 @@ from sparse_gp import SparseGP
class FITC(SparseGP): class FITC(SparseGP):
""" """
sparse FITC approximation
Sparse FITC approximation
:param X: inputs :param X: inputs
:type X: np.ndarray (num_data x Q) :type X: np.ndarray (num_data x Q)
:param likelihood: a likelihood instance, containing the observed data :param likelihood: a likelihood instance, containing the observed data
:type likelihood: GPy.likelihood.(Gaussian | EP) :type likelihood: GPy.likelihood.(Gaussian | EP)
:param kernel : the kernel (covariance function). See link kernels :param kernel: the kernel (covariance function). See link kernels
:type kernel: a GPy.kern.kern instance :type kernel: a GPy.kern.kern instance
:param Z: inducing inputs (optional, see note) :param Z: inducing inputs (optional, see note)
:type Z: np.ndarray (M x Q) | None :type Z: np.ndarray (M x Q) | None
:param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales) :param normalize_(X|Y): whether to normalize the data before computing (predictions will be in original scales)
:type normalize_(X|Y): bool :type normalize_(X|Y): bool
""" """
def __init__(self, X, likelihood, kernel, Z, normalize_X=False): def __init__(self, X, likelihood, kernel, Z, normalize_X=False):
SparseGP.__init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False) SparseGP.__init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False)
assert self.output_dim == 1, "FITC model is not defined for handling multiple outputs" assert self.output_dim == 1, "FITC model is not defined for handling multiple outputs"
def update_likelihood_approximation(self): def update_likelihood_approximation(self, **kwargs):
""" """
Approximates a non-Gaussian likelihood using Expectation Propagation Approximates a non-Gaussian likelihood using Expectation Propagation
@ -37,7 +39,7 @@ class FITC(SparseGP):
this function does nothing this function does nothing
""" """
self.likelihood.restart() self.likelihood.restart()
self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0) self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0, **kwargs)
self._set_params(self._get_params()) self._set_params(self._get_params())
def _compute_kernel_matrices(self): def _compute_kernel_matrices(self):
@ -120,7 +122,7 @@ class FITC(SparseGP):
_dKmm = .5*(V_n**2 + alpha_n + gamma_n**2 - 2.*gamma_k) * K_pp_K #Diag_dD_dKmm _dKmm = .5*(V_n**2 + alpha_n + gamma_n**2 - 2.*gamma_k) * K_pp_K #Diag_dD_dKmm
self._dpsi1_dtheta += self.kern.dK_dtheta(_dpsi1,self.X[i:i+1,:],self.Z) self._dpsi1_dtheta += self.kern.dK_dtheta(_dpsi1,self.X[i:i+1,:],self.Z)
self._dKmm_dtheta += self.kern.dK_dtheta(_dKmm,self.Z) self._dKmm_dtheta += self.kern.dK_dtheta(_dKmm,self.Z)
self._dKmm_dX += 2.*self.kern.dK_dX(_dKmm ,self.Z) self._dKmm_dX += self.kern.dK_dX(_dKmm ,self.Z)
self._dpsi1_dX += self.kern.dK_dX(_dpsi1.T,self.Z,self.X[i:i+1,:]) self._dpsi1_dX += self.kern.dK_dX(_dpsi1.T,self.Z,self.X[i:i+1,:])
# the partial derivative vector for the likelihood # the partial derivative vector for the likelihood

View file

@ -15,7 +15,7 @@ class GP(GPBase):
:param X: input observations :param X: input observations
:param kernel: a GPy kernel, defaults to rbf+white :param kernel: a GPy kernel, defaults to rbf+white
:parm likelihood: a GPy likelihood :param likelihood: a GPy likelihood
:param normalize_X: whether to normalize the input data before computing (predictions will be in original scales) :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_X: False|True :type normalize_X: False|True
:rtype: model object :rtype: model object
@ -62,7 +62,7 @@ class GP(GPBase):
def _get_param_names(self): def _get_param_names(self):
return self.kern._get_param_names_transformed() + self.likelihood._get_param_names() return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
def update_likelihood_approximation(self): def update_likelihood_approximation(self, **kwargs):
""" """
Approximates a non-gaussian likelihood using Expectation Propagation Approximates a non-gaussian likelihood using Expectation Propagation
@ -70,7 +70,7 @@ class GP(GPBase):
this function does nothing this function does nothing
""" """
self.likelihood.restart() self.likelihood.restart()
self.likelihood.fit_full(self.kern.K(self.X)) self.likelihood.fit_full(self.kern.K(self.X), **kwargs)
self._set_params(self._get_params()) # update the GP self._set_params(self._get_params()) # update the GP
def _model_fit_term(self): def _model_fit_term(self):
@ -132,17 +132,16 @@ class GP(GPBase):
def predict(self, Xnew, which_parts='all', full_cov=False, likelihood_args=dict()): def predict(self, Xnew, which_parts='all', full_cov=False, likelihood_args=dict()):
""" """
Predict the function(s) at the new point(s) Xnew. Predict the function(s) at the new point(s) Xnew.
Arguments
---------
:param Xnew: The points at which to make a prediction :param Xnew: The points at which to make a prediction
:type Xnew: np.ndarray, Nnew x self.input_dim :type Xnew: np.ndarray, Nnew x self.input_dim
:param which_parts: specifies which outputs kernel(s) to use in prediction :param which_parts: specifies which outputs kernel(s) to use in prediction
:type which_parts: ('all', list of bools) :type which_parts: ('all', list of bools)
:param full_cov: whether to return the full covariance matrix, or just the diagonal :param full_cov: whether to return the full covariance matrix, or just the diagonal
:type full_cov: bool :type full_cov: bool
:rtype: posterior mean, a Numpy array, Nnew x self.input_dim :returns: mean: posterior mean, a Numpy array, Nnew x self.input_dim
:rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise :returns: var: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
:rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim :returns: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim
If full_cov and self.input_dim > 1, the return shape of var is Nnew x Nnew x self.input_dim. If self.input_dim == 1, the return shape is Nnew x Nnew. If full_cov and self.input_dim > 1, the return shape of var is Nnew x Nnew x self.input_dim. If self.input_dim == 1, the return shape is Nnew x Nnew.
@ -160,8 +159,7 @@ class GP(GPBase):
def predict_single_output(self, Xnew, output=0, which_parts='all', full_cov=False): def predict_single_output(self, Xnew, output=0, which_parts='all', full_cov=False):
""" """
For a specific output, predict the function at the new point(s) Xnew. For a specific output, predict the function at the new point(s) Xnew.
Arguments
---------
:param Xnew: The points at which to make a prediction :param Xnew: The points at which to make a prediction
:type Xnew: np.ndarray, Nnew x self.input_dim :type Xnew: np.ndarray, Nnew x self.input_dim
:param output: output to predict :param output: output to predict
@ -170,9 +168,9 @@ class GP(GPBase):
:type which_parts: ('all', list of bools) :type which_parts: ('all', list of bools)
:param full_cov: whether to return the full covariance matrix, or just the diagonal :param full_cov: whether to return the full covariance matrix, or just the diagonal
:type full_cov: bool :type full_cov: bool
:rtype: posterior mean, a Numpy array, Nnew x self.input_dim :returns: posterior mean, a Numpy array, Nnew x self.input_dim
:rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise :returns: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
:rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim :returns: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim
.. Note:: For multiple output models only .. Note:: For multiple output models only
""" """

View file

@ -128,10 +128,9 @@ class GPBase(Model):
else: else:
raise NotImplementedError, "Cannot define a frame with more than two input dimensions" raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
else: else:
assert self.num_outputs > output, 'The model has only %s outputs.' %self.num_outputs assert len(self.likelihood.noise_model_list) > output, 'The model has only %s outputs.' %self.num_outputs
if self.X.shape[1] == 2: if self.X.shape[1] == 2:
assert self.num_outputs >= output, 'The model has only %s outputs.' %self.num_outputs
Xu = self.X[self.X[:,-1]==output ,0:1] Xu = self.X[self.X[:,-1]==output ,0:1]
Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits) Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
@ -263,7 +262,7 @@ class GPBase(Model):
raise NotImplementedError, "Cannot define a frame with more than two input dimensions" raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
else: else:
assert self.num_outputs > output, 'The model has only %s outputs.' %self.num_outputs assert len(self.likelihood.noise_model_list) > output, 'The model has only %s outputs.' %self.num_outputs
if self.X.shape[1] == 2: if self.X.shape[1] == 2:
resolution = resolution or 200 resolution = resolution or 200
Xu = self.X[self.X[:,-1]==output,:] #keep the output of interest Xu = self.X[self.X[:,-1]==output,:] #keep the output of interest
@ -287,3 +286,20 @@ class GPBase(Model):
else: else:
raise NotImplementedError, "Cannot define a frame with more than two input dimensions" raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
"""
def samples_f(self,X,samples=10, which_data='all', which_parts='all',output=None):
if which_data == 'all':
which_data = slice(None)
if hasattr(self,'multioutput'):
np.hstack([X,np.ones((X.shape[0],1))*output])
m, v = self._raw_predict(X, which_parts=which_parts, full_cov=True)
v = v.reshape(m.size,-1) if len(v.shape)==3 else v
Ysim = np.random.multivariate_normal(m.flatten(), v, samples)
#gpplot(X, m, m - 2 * np.sqrt(np.diag(v)[:, None]), m + 2 * np.sqrt(np.diag(v))[:, None, ], axes=ax)
for i in range(samples):
ax.plot(X, Ysim[i, :], Tango.colorsHex['darkBlue'], linewidth=0.25)
"""

View file

@ -49,6 +49,7 @@ class Mapping(Parameterized):
def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, fixed_inputs=[], linecol=Tango.colorsHex['darkBlue']): def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, fixed_inputs=[], linecol=Tango.colorsHex['darkBlue']):
""" """
Plot the mapping. Plot the mapping.
Plots the mapping associated with the model. Plots the mapping associated with the model.
@ -79,8 +80,7 @@ class Mapping(Parameterized):
:type fixed_inputs: a list of tuples :type fixed_inputs: a list of tuples
:param linecol: color of line to plot. :param linecol: color of line to plot.
:type linecol: :type linecol:
:param levels: for 2D plotting, the number of contour levels to use :param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure
is ax is None, create a new figure
""" """
# TODO include samples # TODO include samples

View file

@ -47,6 +47,7 @@ class Model(Parameterized):
:param state: the state of the model. :param state: the state of the model.
:type state: list as returned from getstate. :type state: list as returned from getstate.
""" """
self.preferred_optimizer = state.pop() self.preferred_optimizer = state.pop()
self.sampling_runs = state.pop() self.sampling_runs = state.pop()
@ -56,10 +57,11 @@ class Model(Parameterized):
def set_prior(self, regexp, what): def set_prior(self, regexp, what):
""" """
Sets priors on the model parameters. Sets priors on the model parameters.
Notes **Notes**
-----
Asserts that the prior is suitable for the constraint. If the Asserts that the prior is suitable for the constraint. If the
wrong constraint is in place, an error is raised. If no wrong constraint is in place, an error is raised. If no
constraint is in place, one is added (warning printed). constraint is in place, one is added (warning printed).
@ -185,8 +187,8 @@ class Model(Parameterized):
be handled silently. If _all_ runs fail, the model is reset to the be handled silently. If _all_ runs fail, the model is reset to the
existing parameter values. existing parameter values.
Notes **Notes**
-----
:param num_restarts: number of restarts to use (default 10) :param num_restarts: number of restarts to use (default 10)
:type num_restarts: int :type num_restarts: int
:param robust: whether to handle exceptions silently or not (default False) :param robust: whether to handle exceptions silently or not (default False)
@ -195,7 +197,9 @@ class Model(Parameterized):
:type parallel: bool :type parallel: bool
:param num_processes: number of workers in the multiprocessing pool :param num_processes: number of workers in the multiprocessing pool
:type numprocesses: int :type numprocesses: int
**kwargs are passed to the optimizer. They can be:
\*\*kwargs are passed to the optimizer. They can be:
:param max_f_eval: maximum number of function evaluations :param max_f_eval: maximum number of function evaluations
:type max_f_eval: int :type max_f_eval: int
:param max_iters: maximum number of iterations :param max_iters: maximum number of iterations
@ -203,9 +207,7 @@ class Model(Parameterized):
:param messages: whether to display during optimisation :param messages: whether to display during optimisation
:type messages: bool :type messages: bool
..Note: If num_processes is None, the number of workes in the multiprocessing pool is automatically .. note:: If num_processes is None, the number of workes in the multiprocessing pool is automatically set to the number of processors on the current machine.
set to the number of processors on the current machine.
""" """
initial_parameters = self._get_params_transformed() initial_parameters = self._get_params_transformed()
@ -538,22 +540,17 @@ class Model(Parameterized):
return k.variances return k.variances
def pseudo_EM(self, epsilon=.1, **kwargs): def pseudo_EM(self, stop_crit=.1, **kwargs):
""" """
TODO: Should this not bein the GP class?
EM - like algorithm for Expectation Propagation and Laplace approximation EM - like algorithm for Expectation Propagation and Laplace approximation
kwargs are passed to the optimize function. They can be: :param stop_crit: convergence criterion
:type stop_crit: float
:epsilon: convergence criterion
:max_f_eval: maximum number of function evaluations
:messages: whether to display during optimisation
:param optimzer: whice optimizer to use (defaults to self.preferred optimizer)
:type optimzer: string TODO: valid strings?
.. Note: kwargs are passed to update_likelihood and optimize functions.
""" """
assert isinstance(self.likelihood, likelihoods.EP) or isinstance(self.likelihood, likelihoods.EP_Mixed_Noise), "pseudo_EM is only available for EP likelihoods" assert isinstance(self.likelihood, likelihoods.EP) or isinstance(self.likelihood, likelihoods.EP_Mixed_Noise), "pseudo_EM is only available for EP likelihoods"
ll_change = epsilon + 1. ll_change = stop_crit + 1.
iteration = 0 iteration = 0
last_ll = -np.inf last_ll = -np.inf
@ -561,10 +558,25 @@ class Model(Parameterized):
alpha = 0 alpha = 0
stop = False stop = False
#Handle **kwargs
ep_args = {}
for arg in kwargs.keys():
if arg in ('epsilon','power_ep'):
ep_args[arg] = kwargs[arg]
del kwargs[arg]
while not stop: while not stop:
last_approximation = self.likelihood.copy() last_approximation = self.likelihood.copy()
last_params = self._get_params() last_params = self._get_params()
self.update_likelihood_approximation() if len(ep_args) == 2:
self.update_likelihood_approximation(epsilon=ep_args['epsilon'],power_ep=ep_args['power_ep'])
elif len(ep_args) == 1:
if ep_args.keys()[0] == 'epsilon':
self.update_likelihood_approximation(epsilon=ep_args['epsilon'])
elif ep_args.keys()[0] == 'power_ep':
self.update_likelihood_approximation(power_ep=ep_args['power_ep'])
else:
self.update_likelihood_approximation()
new_ll = self.log_likelihood() new_ll = self.log_likelihood()
ll_change = new_ll - last_ll ll_change = new_ll - last_ll
@ -576,7 +588,7 @@ class Model(Parameterized):
else: else:
self.optimize(**kwargs) self.optimize(**kwargs)
last_ll = self.log_likelihood() last_ll = self.log_likelihood()
if ll_change < epsilon: if ll_change < stop_crit:
stop = True stop = True
iteration += 1 iteration += 1
if stop: if stop:

View file

@ -231,17 +231,19 @@ class Parameterized(object):
def constrain_fixed(self, regexp, value=None): def constrain_fixed(self, regexp, value=None):
""" """
Arguments
---------
:param regexp: which parameters need to be fixed. :param regexp: which parameters need to be fixed.
:type regexp: ndarray(dtype=int) or regular expression object or string :type regexp: ndarray(dtype=int) or regular expression object or string
:param value: the vlaue to fix the parameters to. If the value is not specified, :param value: the vlaue to fix the parameters to. If the value is not specified,
the parameter is fixed to the current value the parameter is fixed to the current value
:type value: float :type value: float
Notes
----- **Notes**
Fixing a parameter which is tied to another, or constrained in some way will result in an error. Fixing a parameter which is tied to another, or constrained in some way will result in an error.
To fix multiple parameters to the same value, simply pass a regular expression which matches both parameter names, or pass both of the indexes
To fix multiple parameters to the same value, simply pass a regular expression which matches both parameter names, or pass both of the indexes.
""" """
matches = self.grep_param_names(regexp) matches = self.grep_param_names(regexp)
overlap = set(matches).intersection(set(self.all_constrained_indices())) overlap = set(matches).intersection(set(self.all_constrained_indices()))

View file

@ -16,16 +16,17 @@ class SparseGP(GPBase):
:type X: np.ndarray (num_data x input_dim) :type X: np.ndarray (num_data x input_dim)
:param likelihood: a likelihood instance, containing the observed data :param likelihood: a likelihood instance, containing the observed data
:type likelihood: GPy.likelihood.(Gaussian | EP | Laplace) :type likelihood: GPy.likelihood.(Gaussian | EP | Laplace)
:param kernel : the kernel (covariance function). See link kernels :param kernel: the kernel (covariance function). See link kernels
:type kernel: a GPy.kern.kern instance :type kernel: a GPy.kern.kern instance
:param X_variance: The uncertainty in the measurements of X (Gaussian variance) :param X_variance: The uncertainty in the measurements of X (Gaussian variance)
:type X_variance: np.ndarray (num_data x input_dim) | None :type X_variance: np.ndarray (num_data x input_dim) | None
:param Z: inducing inputs (optional, see note) :param Z: inducing inputs (optional, see note)
:type Z: np.ndarray (num_inducing x input_dim) | None :type Z: np.ndarray (num_inducing x input_dim) | None
:param num_inducing : Number of inducing points (optional, default 10. Ignored if Z is not None) :param num_inducing: Number of inducing points (optional, default 10. Ignored if Z is not None)
:type num_inducing: int :type num_inducing: int
:param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales) :param normalize_(X|Y): whether to normalize the data before computing (predictions will be in original scales)
:type normalize_(X|Y): bool :type normalize_(X|Y): bool
""" """
def __init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False): def __init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False):
@ -215,7 +216,7 @@ class SparseGP(GPBase):
#def _get_print_names(self): #def _get_print_names(self):
# return self.kern._get_param_names_transformed() + self.likelihood._get_param_names() # return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
def update_likelihood_approximation(self): def update_likelihood_approximation(self, **kwargs):
""" """
Approximates a non-gaussian likelihood using Expectation Propagation Approximates a non-gaussian likelihood using Expectation Propagation
@ -229,10 +230,10 @@ class SparseGP(GPBase):
Kmmi = tdot(Lmi.T) Kmmi = tdot(Lmi.T)
diag_tr_psi2Kmmi = np.array([np.trace(psi2_Kmmi) for psi2_Kmmi in np.dot(self.psi2, Kmmi)]) diag_tr_psi2Kmmi = np.array([np.trace(psi2_Kmmi) for psi2_Kmmi in np.dot(self.psi2, Kmmi)])
self.likelihood.fit_FITC(self.Kmm, self.psi1.T, diag_tr_psi2Kmmi) # This uses the fit_FITC code, but does not perfomr a FITC-EP.#TODO solve potential confusion self.likelihood.fit_FITC(self.Kmm, self.psi1.T, diag_tr_psi2Kmmi, **kwargs) # This uses the fit_FITC code, but does not perfomr a FITC-EP.#TODO solve potential confusion
# raise NotImplementedError, "EP approximation not implemented for uncertain inputs" # raise NotImplementedError, "EP approximation not implemented for uncertain inputs"
else: else:
self.likelihood.fit_DTC(self.Kmm, self.psi1.T) self.likelihood.fit_DTC(self.Kmm, self.psi1.T, **kwargs)
# self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0) # self.likelihood.fit_FITC(self.Kmm,self.psi1,self.psi0)
self._set_params(self._get_params()) # update the GP self._set_params(self._get_params()) # update the GP
@ -292,7 +293,7 @@ class SparseGP(GPBase):
Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts) Kxx = self.kern.Kdiag(Xnew, which_parts=which_parts)
var = Kxx - np.sum(Kx * np.dot(Kmmi_LmiBLmi, Kx), 0) var = Kxx - np.sum(Kx * np.dot(Kmmi_LmiBLmi, Kx), 0)
else: else:
# assert which_p.Tarts=='all', "swithching out parts of variational kernels is not implemented" # assert which_parts=='all', "swithching out parts of variational kernels is not implemented"
Kx = self.kern.psi1(self.Z, Xnew, X_variance_new) # , which_parts=which_parts) TODO: which_parts Kx = self.kern.psi1(self.Z, Xnew, X_variance_new) # , which_parts=which_parts) TODO: which_parts
mu = np.dot(Kx, self.Cpsi1V) mu = np.dot(Kx, self.Cpsi1V)
if full_cov: if full_cov:
@ -306,10 +307,11 @@ class SparseGP(GPBase):
def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False): def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
""" """
Predict the function(s) at the new point(s) Xnew. Predict the function(s) at the new point(s) Xnew.
Arguments **Arguments**
---------
:param Xnew: The points at which to make a prediction :param Xnew: The points at which to make a prediction
:type Xnew: np.ndarray, Nnew x self.input_dim :type Xnew: np.ndarray, Nnew x self.input_dim
:param X_variance_new: The uncertainty in the prediction points :param X_variance_new: The uncertainty in the prediction points
@ -365,9 +367,8 @@ class SparseGP(GPBase):
ax.plot(Zu[:, 0], Zu[:, 1], 'wo') ax.plot(Zu[:, 0], Zu[:, 1], 'wo')
else: else:
pass
"""
if self.X.shape[1] == 2 and hasattr(self,'multioutput'): if self.X.shape[1] == 2 and hasattr(self,'multioutput'):
"""
Xu = self.X[self.X[:,-1]==output,:] Xu = self.X[self.X[:,-1]==output,:]
if self.has_uncertain_inputs: if self.has_uncertain_inputs:
Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now
@ -378,6 +379,7 @@ class SparseGP(GPBase):
xerr=2 * np.sqrt(self.X_variance[which_data, 0]), xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
ecolor='k', fmt=None, elinewidth=.5, alpha=.5) ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
"""
Zu = self.Z[self.Z[:,-1]==output,:] Zu = self.Z[self.Z[:,-1]==output,:]
Zu = self.Z * self._Xscale + self._Xoffset Zu = self.Z * self._Xscale + self._Xoffset
Zu = self.Z[self.Z[:,-1]==output ,0:1] #?? Zu = self.Z[self.Z[:,-1]==output ,0:1] #??
@ -386,13 +388,11 @@ class SparseGP(GPBase):
else: else:
raise NotImplementedError, "Cannot define a frame with more than two input dimensions" raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
"""
def predict_single_output(self, Xnew, output=0, which_parts='all', full_cov=False): def predict_single_output(self, Xnew, output=0, which_parts='all', full_cov=False):
""" """
For a specific output, predict the function at the new point(s) Xnew. For a specific output, predict the function at the new point(s) Xnew.
Arguments
---------
:param Xnew: The points at which to make a prediction :param Xnew: The points at which to make a prediction
:type Xnew: np.ndarray, Nnew x self.input_dim :type Xnew: np.ndarray, Nnew x self.input_dim
:param output: output to predict :param output: output to predict

View file

@ -14,6 +14,7 @@ import sys
class SVIGP(GPBase): class SVIGP(GPBase):
""" """
Stochastic Variational inference in a Gaussian Process Stochastic Variational inference in a Gaussian Process
:param X: inputs :param X: inputs
@ -22,25 +23,26 @@ class SVIGP(GPBase):
:type Y: np.ndarray of observations (N x D) :type Y: np.ndarray of observations (N x D)
:param batchsize: the size of a h :param batchsize: the size of a h
Additional kwargs are used as for a sparse GP. They include Additional kwargs are used as for a sparse GP. They include:
:param q_u: canonical parameters of the distribution squasehd into a 1D array :param q_u: canonical parameters of the distribution squasehd into a 1D array
:type q_u: np.ndarray :type q_u: np.ndarray
:param M : Number of inducing points (optional, default 10. Ignored if Z is not None) :param M: Number of inducing points (optional, default 10. Ignored if Z is not None)
:type M: int :type M: int
:param kernel : the kernel/covariance function. See link kernels :param kernel: the kernel/covariance function. See link kernels
:type kernel: a GPy kernel :type kernel: a GPy kernel
:param Z: inducing inputs (optional, see note) :param Z: inducing inputs (optional, see note)
:type Z: np.ndarray (M x Q) | None :type Z: np.ndarray (M x Q) | None
:param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance) :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance)
:type X_uncertainty: np.ndarray (N x Q) | None :type X_uncertainty: np.ndarray (N x Q) | None
:param Zslices: slices for the inducing inputs (see slicing TODO: link) :param Zslices: slices for the inducing inputs (see slicing TODO: link)
:param M : Number of inducing points (optional, default 10. Ignored if Z is not None) :param M: Number of inducing points (optional, default 10. Ignored if Z is not None)
:type M: int :type M: int
:param beta: noise precision. TODO> ignore beta if doing EP :param beta: noise precision. TODO: ignore beta if doing EP
:type beta: float :type beta: float
:param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales) :param normalize_(X|Y): whether to normalize the data before computing (predictions will be in original scales)
:type normalize_(X|Y): bool :type normalize_(X|Y): bool
""" """

View file

@ -10,31 +10,11 @@ import numpy as np
import GPy import GPy
default_seed = 10000 default_seed = 10000
def crescent_data(seed=default_seed, kernel=None): # FIXME
"""Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
:param model_type: type of model to fit ['Full', 'FITC', 'DTC'].
:param seed : seed value for data generation.
:type seed: int
:param inducing : number of inducing variables (only used for 'FITC' or 'DTC').
:type inducing: int
"""
data = GPy.util.datasets.crescent_data(seed=seed)
Y = data['Y']
Y[Y.flatten()==-1] = 0
m = GPy.models.GPClassification(data['X'], Y)
#m.update_likelihood_approximation()
#m.optimize()
m.pseudo_EM()
print(m)
m.plot()
return m
def oil(num_inducing=50, max_iters=100, kernel=None): def oil(num_inducing=50, max_iters=100, kernel=None):
""" """
Run a Gaussian process classification on the three phase oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood. Run a Gaussian process classification on the three phase oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
""" """
data = GPy.util.datasets.oil() data = GPy.util.datasets.oil()
X = data['X'] X = data['X']
@ -64,8 +44,10 @@ def oil(num_inducing=50, max_iters=100, kernel=None):
def toy_linear_1d_classification(seed=default_seed): def toy_linear_1d_classification(seed=default_seed):
""" """
Simple 1D classification example Simple 1D classification example
:param seed : seed value for data generation (default is 4).
:param seed: seed value for data generation (default is 4).
:type seed: int :type seed: int
""" """
data = GPy.util.datasets.toy_linear_1d_classification(seed=seed) data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
@ -92,8 +74,10 @@ def toy_linear_1d_classification(seed=default_seed):
def sparse_toy_linear_1d_classification(num_inducing=10,seed=default_seed): def sparse_toy_linear_1d_classification(num_inducing=10,seed=default_seed):
""" """
Sparse 1D classification example Sparse 1D classification example
:param seed : seed value for data generation (default is 4).
:param seed: seed value for data generation (default is 4).
:type seed: int :type seed: int
""" """
data = GPy.util.datasets.toy_linear_1d_classification(seed=seed) data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
@ -118,61 +102,13 @@ def sparse_toy_linear_1d_classification(num_inducing=10,seed=default_seed):
return m return m
def sparse_crescent_data(num_inducing=10, seed=default_seed, kernel=None):
"""
Run a Gaussian process classification with DTC approxiamtion on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
:param model_type: type of model to fit ['Full', 'FITC', 'DTC'].
:param seed : seed value for data generation.
:type seed: int
:param inducing : number of inducing variables (only used for 'FITC' or 'DTC').
:type inducing: int
"""
data = GPy.util.datasets.crescent_data(seed=seed)
Y = data['Y']
Y[Y.flatten()==-1]=0
m = GPy.models.SparseGPClassification(data['X'], Y, kernel=kernel, num_inducing=num_inducing)
m['.*len'] = 10.
#m.update_likelihood_approximation()
#m.optimize()
m.pseudo_EM()
print(m)
m.plot()
return m
def FITC_crescent_data(num_inducing=10, seed=default_seed):
"""
Run a Gaussian process classification with FITC approximation on the crescent data. The demonstration uses EP to approximate the likelihood.
:param model_type: type of model to fit ['Full', 'FITC', 'DTC'].
:param seed : seed value for data generation.
:type seed: int
:param inducing : number of inducing variables (only used for 'FITC' or 'DTC').
:type num_inducing: int
"""
data = GPy.util.datasets.crescent_data(seed=seed)
Y = data['Y']
Y[Y.flatten()==-1]=0
m = GPy.models.FITCClassification(data['X'], Y,num_inducing=num_inducing)
m.constrain_bounded('.*len',1.,1e3)
m['.*len'] = 3.
#m.update_likelihood_approximation()
#m.optimize()
m.pseudo_EM()
print(m)
m.plot()
return m
def toy_heaviside(seed=default_seed): def toy_heaviside(seed=default_seed):
""" """
Simple 1D classification example using a heavy side gp transformation Simple 1D classification example using a heavy side gp transformation
:param seed : seed value for data generation (default is 4).
:param seed: seed value for data generation (default is 4).
:type seed: int :type seed: int
""" """
data = GPy.util.datasets.toy_linear_1d_classification(seed=seed) data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
@ -198,3 +134,35 @@ def toy_heaviside(seed=default_seed):
return m return m
def crescent_data(model_type='Full', num_inducing=10, seed=default_seed, kernel=None):
"""
Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.
:param model_type: type of model to fit ['Full', 'FITC', 'DTC'].
:param inducing: number of inducing variables (only used for 'FITC' or 'DTC').
:type inducing: int
:param seed: seed value for data generation.
:type seed: int
:param kernel: kernel to use in the model
:type kernel: a GPy kernel
"""
data = GPy.util.datasets.crescent_data(seed=seed)
Y = data['Y']
Y[Y.flatten()==-1] = 0
if model_type == 'Full':
m = GPy.models.GPClassification(data['X'], Y,kernel=kernel)
elif model_type == 'DTC':
m = GPy.models.SparseGPClassification(data['X'], Y, kernel=kernel, num_inducing=num_inducing)
m['.*len'] = 10.
elif model_type == 'FITC':
m = GPy.models.FITCClassification(data['X'], Y, kernel=kernel, num_inducing=num_inducing)
m['.*len'] = 3.
m.pseudo_EM()
print(m)
m.plot()
return m

View file

@ -132,7 +132,7 @@ def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=1000
length_scales = np.linspace(0.1, 60., resolution) length_scales = np.linspace(0.1, 60., resolution)
log_SNRs = np.linspace(-3., 4., resolution) log_SNRs = np.linspace(-3., 4., resolution)
data = GPy.util.datasets.della_gatta_TRP63_gene_expression(gene_number) data = GPy.util.datasets.della_gatta_TRP63_gene_expression(data_set='della_gatta',gene_number=gene_number)
# data['Y'] = data['Y'][0::2, :] # data['Y'] = data['Y'][0::2, :]
# data['X'] = data['X'][0::2, :] # data['X'] = data['X'][0::2, :]

View file

@ -233,7 +233,7 @@ class CGD(Async_Optimize):
""" """
opt_async(self, f, df, x0, callback, update_rule=FletcherReeves, opt_async(self, f, df, x0, callback, update_rule=FletcherReeves,
messages=0, maxiter=5e3, max_f_eval=15e3, gtol=1e-6, messages=0, maxiter=5e3, max_f_eval=15e3, gtol=1e-6,
report_every=10, *args, **kwargs) report_every=10, \*args, \*\*kwargs)
callback gets called every `report_every` iterations callback gets called every `report_every` iterations
@ -244,16 +244,14 @@ class CGD(Async_Optimize):
f, and df will be called with f, and df will be called with
f(xi, *args, **kwargs) f(xi, \*args, \*\*kwargs)
df(xi, *args, **kwargs) df(xi, \*args, \*\*kwargs)
**returns** **Returns:**
-----------
Started `Process` object, optimizing asynchronously Started `Process` object, optimizing asynchronously
**calls** **Calls:**
---------
callback(x_opt, f_opt, g_opt, iteration, function_calls, gradient_calls, status_message) callback(x_opt, f_opt, g_opt, iteration, function_calls, gradient_calls, status_message)
@ -265,7 +263,7 @@ class CGD(Async_Optimize):
""" """
opt(self, f, df, x0, callback=None, update_rule=FletcherReeves, opt(self, f, df, x0, callback=None, update_rule=FletcherReeves,
messages=0, maxiter=5e3, max_f_eval=15e3, gtol=1e-6, messages=0, maxiter=5e3, max_f_eval=15e3, gtol=1e-6,
report_every=10, *args, **kwargs) report_every=10, \*args, \*\*kwargs)
Minimize f, calling callback every `report_every` iterations with following syntax: Minimize f, calling callback every `report_every` iterations with following syntax:
@ -276,11 +274,10 @@ class CGD(Async_Optimize):
f, and df will be called with f, and df will be called with
f(xi, *args, **kwargs) f(xi, \*args, \*\*kwargs)
df(xi, *args, **kwargs) df(xi, \*args, \*\*kwargs)
**returns** **returns**
---------
x_opt, f_opt, g_opt, iteration, function_calls, gradient_calls, status_message x_opt, f_opt, g_opt, iteration, function_calls, gradient_calls, status_message

View file

@ -29,7 +29,7 @@ class Optimizer():
""" """
def __init__(self, x_init, messages=False, model=None, max_f_eval=1e4, max_iters=1e3, def __init__(self, x_init, messages=False, model=None, max_f_eval=1e4, max_iters=1e3,
ftol=None, gtol=None, xtol=None): ftol=None, gtol=None, xtol=None, bfgs_factor=None):
self.opt_name = None self.opt_name = None
self.x_init = x_init self.x_init = x_init
self.messages = messages self.messages = messages
@ -39,6 +39,7 @@ class Optimizer():
self.status = None self.status = None
self.max_f_eval = int(max_f_eval) self.max_f_eval = int(max_f_eval)
self.max_iters = int(max_iters) self.max_iters = int(max_iters)
self.bfgs_factor = bfgs_factor
self.trace = None self.trace = None
self.time = "Not available" self.time = "Not available"
self.xtol = xtol self.xtol = xtol
@ -128,6 +129,8 @@ class opt_lbfgsb(Optimizer):
print "WARNING: l-bfgs-b doesn't have an ftol arg, so I'm going to ignore it" print "WARNING: l-bfgs-b doesn't have an ftol arg, so I'm going to ignore it"
if self.gtol is not None: if self.gtol is not None:
opt_dict['pgtol'] = self.gtol opt_dict['pgtol'] = self.gtol
if self.bfgs_factor is not None:
opt_dict['factr'] = self.bfgs_factor
opt_result = optimize.fmin_l_bfgs_b(f_fp, self.x_init, iprint=iprint, opt_result = optimize.fmin_l_bfgs_b(f_fp, self.x_init, iprint=iprint,
maxfun=self.max_iters, **opt_dict) maxfun=self.max_iters, **opt_dict)

View file

@ -10,11 +10,10 @@ class opt_SGD(Optimizer):
""" """
Optimize using stochastic gradient descent. Optimize using stochastic gradient descent.
*** Parameters *** :param Model: reference to the Model object
Model: reference to the Model object :param iterations: number of iterations
iterations: number of iterations :param learning_rate: learning rate
learning_rate: learning rate :param momentum: momentum
momentum: momentum
""" """

View file

@ -17,6 +17,7 @@ def rbf_inv(input_dim,variance=1., inv_lengthscale=None,ARD=False):
:type lengthscale: float :type lengthscale: float
:param ARD: Auto Relevance Determination (one lengthscale per dimension) :param ARD: Auto Relevance Determination (one lengthscale per dimension)
:type ARD: Boolean :type ARD: Boolean
""" """
part = parts.rbf_inv.RBFInv(input_dim,variance,inv_lengthscale,ARD) part = parts.rbf_inv.RBFInv(input_dim,variance,inv_lengthscale,ARD)
return kern(input_dim, [part]) return kern(input_dim, [part])
@ -33,6 +34,7 @@ def rbf(input_dim,variance=1., lengthscale=None,ARD=False):
:type lengthscale: float :type lengthscale: float
:param ARD: Auto Relevance Determination (one lengthscale per dimension) :param ARD: Auto Relevance Determination (one lengthscale per dimension)
:type ARD: Boolean :type ARD: Boolean
""" """
part = parts.rbf.RBF(input_dim,variance,lengthscale,ARD) part = parts.rbf.RBF(input_dim,variance,lengthscale,ARD)
return kern(input_dim, [part]) return kern(input_dim, [part])
@ -41,11 +43,13 @@ def linear(input_dim,variances=None,ARD=False):
""" """
Construct a linear kernel. Construct a linear kernel.
Arguments :param input_dim: dimensionality of the kernel, obligatory
--------- :type input_dim: int
input_dimD (int), obligatory :param variances:
variances (np.ndarray) :type variances: np.ndarray
ARD (boolean) :param ARD: Auto Relevance Determination (one lengthscale per dimension)
:type ARD: Boolean
""" """
part = parts.linear.Linear(input_dim,variances,ARD) part = parts.linear.Linear(input_dim,variances,ARD)
return kern(input_dim, [part]) return kern(input_dim, [part])
@ -64,39 +68,42 @@ def mlp(input_dim,variance=1., weight_variance=None,bias_variance=100.,ARD=False
:type bias_variance: float :type bias_variance: float
:param ARD: Auto Relevance Determination (allows for ARD version of covariance) :param ARD: Auto Relevance Determination (allows for ARD version of covariance)
:type ARD: Boolean :type ARD: Boolean
""" """
part = parts.mlp.MLP(input_dim,variance,weight_variance,bias_variance,ARD) part = parts.mlp.MLP(input_dim,variance,weight_variance,bias_variance,ARD)
return kern(input_dim, [part]) return kern(input_dim, [part])
def gibbs(input_dim,variance=1., mapping=None): def gibbs(input_dim,variance=1., mapping=None):
""" """
Gibbs and MacKay non-stationary covariance function. Gibbs and MacKay non-stationary covariance function.
.. math:: .. math::
r = sqrt((x_i - x_j)'*(x_i - x_j)) r = \\sqrt{((x_i - x_j)'*(x_i - x_j))}
k(x_i, x_j) = \sigma^2*Z*exp(-r^2/(l(x)*l(x) + l(x')*l(x'))) k(x_i, x_j) = \\sigma^2*Z*exp(-r^2/(l(x)*l(x) + l(x')*l(x')))
Z = \sqrt{2*l(x)*l(x')/(l(x)*l(x) + l(x')*l(x')} Z = \\sqrt{2*l(x)*l(x')/(l(x)*l(x) + l(x')*l(x')}
where :math:`l(x)` is a function giving the length scale as a function of space. Where :math:`l(x)` is a function giving the length scale as a function of space.
This is the non stationary kernel proposed by Mark Gibbs in his 1997
thesis. It is similar to an RBF but has a length scale that varies
with input location. This leads to an additional term in front of
the kernel.
The parameters are :math:`\sigma^2`, the process variance, and the parameters of l(x) which is a function that can be specified by the user, by default an multi-layer peceptron is used is used. This is the non stationary kernel proposed by Mark Gibbs in his 1997
thesis. It is similar to an RBF but has a length scale that varies
with input location. This leads to an additional term in front of
the kernel.
:param input_dim: the number of input dimensions The parameters are :math:`\\sigma^2`, the process variance, and the parameters of l(x) which is a function that can be specified by the user, by default an multi-layer peceptron is used is used.
:type input_dim: int
:param variance: the variance :math:`\sigma^2` :param input_dim: the number of input dimensions
:type variance: float :type input_dim: int
:param mapping: the mapping that gives the lengthscale across the input space. :param variance: the variance :math:`\\sigma^2`
:type mapping: GPy.core.Mapping :type variance: float
:param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter \sigma^2_w), otherwise there is one weight variance parameter per dimension. :param mapping: the mapping that gives the lengthscale across the input space.
:type ARD: Boolean :type mapping: GPy.core.Mapping
:rtype: Kernpart object :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter :math:`\\sigma^2_w`), otherwise there is one weight variance parameter per dimension.
:type ARD: Boolean
:rtype: Kernpart object
""" """
part = parts.gibbs.Gibbs(input_dim,variance,mapping) part = parts.gibbs.Gibbs(input_dim,variance,mapping)
@ -124,6 +131,7 @@ def poly(input_dim,variance=1., weight_variance=None,bias_variance=1.,degree=2,
:type degree: int :type degree: int
:param ARD: Auto Relevance Determination (allows for ARD version of covariance) :param ARD: Auto Relevance Determination (allows for ARD version of covariance)
:type ARD: Boolean :type ARD: Boolean
""" """
part = parts.poly.POLY(input_dim,variance,weight_variance,bias_variance,degree,ARD) part = parts.poly.POLY(input_dim,variance,weight_variance,bias_variance,degree,ARD)
return kern(input_dim, [part]) return kern(input_dim, [part])
@ -132,14 +140,42 @@ def white(input_dim,variance=1.):
""" """
Construct a white kernel. Construct a white kernel.
Arguments :param input_dim: dimensionality of the kernel, obligatory
--------- :type input_dim: int
input_dimD (int), obligatory :param variance: the variance of the kernel
variance (float) :type variance: float
""" """
part = parts.white.White(input_dim,variance) part = parts.white.White(input_dim,variance)
return kern(input_dim, [part]) return kern(input_dim, [part])
def eq_ode1(output_dim, W=None, rank=1, kappa=None, length_scale=1., decay=None, delay=None):
"""Covariance function for first order differential equation driven by an exponentiated quadratic covariance.
This outputs of this kernel have the form
.. math::
\frac{\text{d}y_j}{\text{d}t} = \sum_{i=1}^R w_{j,i} f_i(t-\delta_j) +\sqrt{\kappa_j}g_j(t) - d_jy_j(t)
where :math:`R` is the rank of the system, :math:`w_{j,i}` is the sensitivity of the :math:`j`th output to the :math:`i`th latent function, :math:`d_j` is the decay rate of the :math:`j`th output and :math:`f_i(t)` and :math:`g_i(t)` are independent latent Gaussian processes goverened by an exponentiated quadratic covariance.
:param output_dim: number of outputs driven by latent function.
:type output_dim: int
:param W: sensitivities of each output to the latent driving function.
:type W: ndarray (output_dim x rank).
:param rank: If rank is greater than 1 then there are assumed to be a total of rank latent forces independently driving the system, each with identical covariance.
:type rank: int
:param decay: decay rates for the first order system.
:type decay: array of length output_dim.
:param delay: delay between latent force and output response.
:type delay: array of length output_dim.
:param kappa: diagonal term that allows each latent output to have an independent component to the response.
:type kappa: array of length output_dim.
.. Note: see first order differential equation examples in GPy.examples.regression for some usage.
"""
part = parts.eq_ode1.Eq_ode1(output_dim, W, rank, kappa, length_scale, decay, delay)
return kern(2, [part])
def exponential(input_dim,variance=1., lengthscale=None, ARD=False): def exponential(input_dim,variance=1., lengthscale=None, ARD=False):
""" """
@ -153,6 +189,7 @@ def exponential(input_dim,variance=1., lengthscale=None, ARD=False):
:type lengthscale: float :type lengthscale: float
:param ARD: Auto Relevance Determination (one lengthscale per dimension) :param ARD: Auto Relevance Determination (one lengthscale per dimension)
:type ARD: Boolean :type ARD: Boolean
""" """
part = parts.exponential.Exponential(input_dim,variance, lengthscale, ARD) part = parts.exponential.Exponential(input_dim,variance, lengthscale, ARD)
return kern(input_dim, [part]) return kern(input_dim, [part])
@ -169,6 +206,7 @@ def Matern32(input_dim,variance=1., lengthscale=None, ARD=False):
:type lengthscale: float :type lengthscale: float
:param ARD: Auto Relevance Determination (one lengthscale per dimension) :param ARD: Auto Relevance Determination (one lengthscale per dimension)
:type ARD: Boolean :type ARD: Boolean
""" """
part = parts.Matern32.Matern32(input_dim,variance, lengthscale, ARD) part = parts.Matern32.Matern32(input_dim,variance, lengthscale, ARD)
return kern(input_dim, [part]) return kern(input_dim, [part])
@ -185,6 +223,7 @@ def Matern52(input_dim, variance=1., lengthscale=None, ARD=False):
:type lengthscale: float :type lengthscale: float
:param ARD: Auto Relevance Determination (one lengthscale per dimension) :param ARD: Auto Relevance Determination (one lengthscale per dimension)
:type ARD: Boolean :type ARD: Boolean
""" """
part = parts.Matern52.Matern52(input_dim, variance, lengthscale, ARD) part = parts.Matern52.Matern52(input_dim, variance, lengthscale, ARD)
return kern(input_dim, [part]) return kern(input_dim, [part])
@ -193,10 +232,11 @@ def bias(input_dim, variance=1.):
""" """
Construct a bias kernel. Construct a bias kernel.
Arguments :param input_dim: dimensionality of the kernel, obligatory
--------- :type input_dim: int
input_dim (int), obligatory :param variance: the variance of the kernel
variance (float) :type variance: float
""" """
part = parts.bias.Bias(input_dim, variance) part = parts.bias.Bias(input_dim, variance)
return kern(input_dim, [part]) return kern(input_dim, [part])
@ -204,10 +244,15 @@ def bias(input_dim, variance=1.):
def finite_dimensional(input_dim, F, G, variances=1., weights=None): def finite_dimensional(input_dim, F, G, variances=1., weights=None):
""" """
Construct a finite dimensional kernel. Construct a finite dimensional kernel.
input_dim: int - the number of input dimensions
F: np.array of functions with shape (n,) - the n basis functions :param input_dim: the number of input dimensions
G: np.array with shape (n,n) - the Gram matrix associated to F :type input_dim: int
variances : np.ndarray with shape (n,) :param F: np.array of functions with shape (n,) - the n basis functions
:type F: np.array
:param G: np.array with shape (n,n) - the Gram matrix associated to F
:type G: np.array
:param variances: np.ndarray with shape (n,)
:type: np.ndarray
""" """
part = parts.finite_dimensional.FiniteDimensional(input_dim, F, G, variances, weights) part = parts.finite_dimensional.FiniteDimensional(input_dim, F, G, variances, weights)
return kern(input_dim, [part]) return kern(input_dim, [part])
@ -220,6 +265,7 @@ def spline(input_dim, variance=1.):
:type input_dim: int :type input_dim: int
:param variance: the variance of the kernel :param variance: the variance of the kernel
:type variance: float :type variance: float
""" """
part = parts.spline.Spline(input_dim, variance) part = parts.spline.Spline(input_dim, variance)
return kern(input_dim, [part]) return kern(input_dim, [part])
@ -232,43 +278,78 @@ def Brownian(input_dim, variance=1.):
:type input_dim: int :type input_dim: int
:param variance: the variance of the kernel :param variance: the variance of the kernel
:type variance: float :type variance: float
""" """
part = parts.Brownian.Brownian(input_dim, variance) part = parts.Brownian.Brownian(input_dim, variance)
return kern(input_dim, [part]) return kern(input_dim, [part])
try: try:
import sympy as sp import sympy as sp
from sympykern import spkern
from sympy.parsing.sympy_parser import parse_expr
sympy_available = True sympy_available = True
except ImportError: except ImportError:
sympy_available = False sympy_available = False
if sympy_available: if sympy_available:
from parts.sympykern import spkern
from sympy.parsing.sympy_parser import parse_expr
from GPy.util.symbolic import sinc
def rbf_sympy(input_dim, ARD=False, variance=1., lengthscale=1.): def rbf_sympy(input_dim, ARD=False, variance=1., lengthscale=1.):
""" """
Radial Basis Function covariance. Radial Basis Function covariance.
""" """
X = [sp.var('x%i' % i) for i in range(input_dim)] X = [sp.var('x%i' % i) for i in range(input_dim)]
Z = [sp.var('z%i' % i) for i in range(input_dim)] Z = [sp.var('z%i' % i) for i in range(input_dim)]
rbf_variance = sp.var('rbf_variance',positive=True) variance = sp.var('variance',positive=True)
if ARD: if ARD:
rbf_lengthscales = [sp.var('rbf_lengthscale_%i' % i, positive=True) for i in range(input_dim)] lengthscales = [sp.var('lengthscale_%i' % i, positive=True) for i in range(input_dim)]
dist_string = ' + '.join(['(x%i-z%i)**2/rbf_lengthscale_%i**2' % (i, i, i) for i in range(input_dim)]) dist_string = ' + '.join(['(x%i-z%i)**2/lengthscale_%i**2' % (i, i, i) for i in range(input_dim)])
dist = parse_expr(dist_string) dist = parse_expr(dist_string)
f = rbf_variance*sp.exp(-dist/2.) f = variance*sp.exp(-dist/2.)
else: else:
rbf_lengthscale = sp.var('rbf_lengthscale',positive=True) lengthscale = sp.var('lengthscale',positive=True)
dist_string = ' + '.join(['(x%i-z%i)**2' % (i, i) for i in range(input_dim)]) dist_string = ' + '.join(['(x%i-z%i)**2' % (i, i) for i in range(input_dim)])
dist = parse_expr(dist_string) dist = parse_expr(dist_string)
f = rbf_variance*sp.exp(-dist/(2*rbf_lengthscale**2)) f = variance*sp.exp(-dist/(2*lengthscale**2))
return kern(input_dim, [spkern(input_dim, f)]) return kern(input_dim, [spkern(input_dim, f, name='rbf_sympy')])
def sympykern(input_dim, k): def sinc(input_dim, ARD=False, variance=1., lengthscale=1.):
""" """
A kernel from a symbolic sympy representation TODO: Not clear why this isn't working, suggests argument of sinc is not a number.
sinc covariance funciton
""" """
return kern(input_dim, [spkern(input_dim, k)]) X = [sp.var('x%i' % i) for i in range(input_dim)]
Z = [sp.var('z%i' % i) for i in range(input_dim)]
variance = sp.var('variance',positive=True)
if ARD:
lengthscales = [sp.var('lengthscale_%i' % i, positive=True) for i in range(input_dim)]
dist_string = ' + '.join(['(x%i-z%i)**2/lengthscale_%i**2' % (i, i, i) for i in range(input_dim)])
dist = parse_expr(dist_string)
f = variance*sinc(sp.pi*sp.sqrt(dist))
else:
lengthscale = sp.var('lengthscale',positive=True)
dist_string = ' + '.join(['(x%i-z%i)**2' % (i, i) for i in range(input_dim)])
dist = parse_expr(dist_string)
f = variance*sinc(sp.pi*sp.sqrt(dist)/lengthscale)
return kern(input_dim, [spkern(input_dim, f, name='sinc')])
def sympykern(input_dim, k,name=None):
"""
A base kernel object, where all the hard work in done by sympy.
:param k: the covariance function
:type k: a positive definite sympy function of x1, z1, x2, z2...
To construct a new sympy kernel, you'll need to define:
- a kernel function using a sympy object. Ensure that the kernel is of the form k(x,z).
- that's it! we'll extract the variables from the function k.
Note:
- to handle multiple inputs, call them x1, z1, etc
- to handle multpile correlated outputs, you'll need to define each covariance function and 'cross' variance function. TODO
"""
return kern(input_dim, [spkern(input_dim, k,name)])
del sympy_available del sympy_available
def periodic_exponential(input_dim=1, variance=1., lengthscale=None, period=2 * np.pi, n_freq=10, lower=0., upper=4 * np.pi): def periodic_exponential(input_dim=1, variance=1., lengthscale=None, period=2 * np.pi, n_freq=10, lower=0., upper=4 * np.pi):
@ -285,6 +366,7 @@ def periodic_exponential(input_dim=1, variance=1., lengthscale=None, period=2 *
:type period: float :type period: float
:param n_freq: the number of frequencies considered for the periodic subspace :param n_freq: the number of frequencies considered for the periodic subspace
:type n_freq: int :type n_freq: int
""" """
part = parts.periodic_exponential.PeriodicExponential(input_dim, variance, lengthscale, period, n_freq, lower, upper) part = parts.periodic_exponential.PeriodicExponential(input_dim, variance, lengthscale, period, n_freq, lower, upper)
return kern(input_dim, [part]) return kern(input_dim, [part])
@ -303,6 +385,7 @@ def periodic_Matern32(input_dim, variance=1., lengthscale=None, period=2 * np.pi
:type period: float :type period: float
:param n_freq: the number of frequencies considered for the periodic subspace :param n_freq: the number of frequencies considered for the periodic subspace
:type n_freq: int :type n_freq: int
""" """
part = parts.periodic_Matern32.PeriodicMatern32(input_dim, variance, lengthscale, period, n_freq, lower, upper) part = parts.periodic_Matern32.PeriodicMatern32(input_dim, variance, lengthscale, period, n_freq, lower, upper)
return kern(input_dim, [part]) return kern(input_dim, [part])
@ -321,6 +404,7 @@ def periodic_Matern52(input_dim, variance=1., lengthscale=None, period=2 * np.pi
:type period: float :type period: float
:param n_freq: the number of frequencies considered for the periodic subspace :param n_freq: the number of frequencies considered for the periodic subspace
:type n_freq: int :type n_freq: int
""" """
part = parts.periodic_Matern52.PeriodicMatern52(input_dim, variance, lengthscale, period, n_freq, lower, upper) part = parts.periodic_Matern52.PeriodicMatern52(input_dim, variance, lengthscale, period, n_freq, lower, upper)
return kern(input_dim, [part]) return kern(input_dim, [part])
@ -334,6 +418,7 @@ def prod(k1,k2,tensor=False):
:param tensor: The kernels are either multiply as functions defined on the same input space (default) or on the product of the input spaces :param tensor: The kernels are either multiply as functions defined on the same input space (default) or on the product of the input spaces
:type tensor: Boolean :type tensor: Boolean
:rtype: kernel object :rtype: kernel object
""" """
part = parts.prod.Prod(k1, k2, tensor) part = parts.prod.Prod(k1, k2, tensor)
return kern(part.input_dim, [part]) return kern(part.input_dim, [part])
@ -346,30 +431,32 @@ def symmetric(k):
k_.parts = [symmetric.Symmetric(p) for p in k.parts] k_.parts = [symmetric.Symmetric(p) for p in k.parts]
return k_ return k_
def coregionalize(num_outputs,W_columns=1, W=None, kappa=None): def coregionalize(output_dim,rank=1, W=None, kappa=None):
""" """
Coregionlization matrix B, of the form: Coregionlization matrix B, of the form:
.. math:: .. math::
\mathbf{B} = \mathbf{W}\mathbf{W}^\top + kappa \mathbf{I} \mathbf{B} = \mathbf{W}\mathbf{W}^\top + kappa \mathbf{I}
An intrinsic/linear coregionalization kernel of the form An intrinsic/linear coregionalization kernel of the form:
.. math:: .. math::
k_2(x, y)=\mathbf{B} k(x, y) k_2(x, y)=\mathbf{B} k(x, y)
it is obtainded as the tensor product between a kernel k(x,y) and B. it is obtainded as the tensor product between a kernel k(x,y) and B.
:param num_outputs: the number of outputs to coregionalize :param output_dim: the number of outputs to corregionalize
:type num_outputs: int :type output_dim: int
:param W_columns: number of columns of the W matrix (this parameter is ignored if parameter W is not None) :param rank: number of columns of the W matrix (this parameter is ignored if parameter W is not None)
:type W_colunns: int :type rank: int
:param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalization matrix B :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalization matrix B
:type W: numpy array of dimensionality (num_outpus, W_columns) :type W: numpy array of dimensionality (num_outpus, rank)
:param kappa: a vector which allows the outputs to behave independently :param kappa: a vector which allows the outputs to behave independently
:type kappa: numpy array of dimensionality (num_outputs,) :type kappa: numpy array of dimensionality (output_dim,)
:rtype: kernel object :rtype: kernel object
""" """
p = parts.coregionalize.Coregionalize(num_outputs,W_columns,W,kappa) p = parts.coregionalize.Coregionalize(output_dim,rank,W,kappa)
return kern(1,[p]) return kern(1,[p])
@ -422,25 +509,26 @@ def independent_outputs(k):
def hierarchical(k): def hierarchical(k):
""" """
TODO THis can't be right! Construct a kernel with independent outputs from an existing kernel TODO This can't be right! Construct a kernel with independent outputs from an existing kernel
""" """
# for sl in k.input_slices: # for sl in k.input_slices:
# assert (sl.start is None) and (sl.stop is None), "cannot adjust input slices! (TODO)" # assert (sl.start is None) and (sl.stop is None), "cannot adjust input slices! (TODO)"
_parts = [parts.hierarchical.Hierarchical(k.parts)] _parts = [parts.hierarchical.Hierarchical(k.parts)]
return kern(k.input_dim+len(k.parts),_parts) return kern(k.input_dim+len(k.parts),_parts)
def build_lcm(input_dim, num_outputs, kernel_list = [], W_columns=1,W=None,kappa=None): def build_lcm(input_dim, output_dim, kernel_list = [], rank=1,W=None,kappa=None):
""" """
Builds a kernel of a linear coregionalization model Builds a kernel of a linear coregionalization model
:input_dim: Input dimensionality :input_dim: Input dimensionality
:num_outputs: Number of outputs :output_dim: Number of outputs
:kernel_list: List of coregionalized kernels, each element in the list will be multiplied by a different corregionalization matrix :kernel_list: List of coregionalized kernels, each element in the list will be multiplied by a different corregionalization matrix
:type kernel_list: list of GPy kernels :type kernel_list: list of GPy kernels
:param W_columns: number tuples of the corregionalization parameters 'coregion_W' :param rank: number tuples of the corregionalization parameters 'coregion_W'
:type W_columns: integer :type rank: integer
..note the kernels dimensionality is overwritten to fit input_dim
..Note the kernels dimensionality is overwritten to fit input_dim
""" """
for k in kernel_list: for k in kernel_list:
@ -448,11 +536,31 @@ def build_lcm(input_dim, num_outputs, kernel_list = [], W_columns=1,W=None,kappa
k.input_dim = input_dim k.input_dim = input_dim
warnings.warn("kernel's input dimension overwritten to fit input_dim parameter.") warnings.warn("kernel's input dimension overwritten to fit input_dim parameter.")
k_coreg = coregionalize(num_outputs,W_columns,W,kappa) k_coreg = coregionalize(output_dim,rank,W,kappa)
kernel = kernel_list[0]**k_coreg.copy() kernel = kernel_list[0]**k_coreg.copy()
for k in kernel_list[1:]: for k in kernel_list[1:]:
k_coreg = coregionalize(num_outputs,W_columns,W,kappa) k_coreg = coregionalize(output_dim,rank,W,kappa)
kernel += k**k_coreg.copy() kernel += k**k_coreg.copy()
return kernel return kernel
def ODE_1(input_dim=1, varianceU=1., varianceY=1., lengthscaleU=None, lengthscaleY=None):
"""
kernel resultiong from a first order ODE with OU driving GP
:param input_dim: the number of input dimension, has to be equal to one
:type input_dim: int
:param varianceU: variance of the driving GP
:type varianceU: float
:param lengthscaleU: lengthscale of the driving GP
:type lengthscaleU: float
:param varianceY: 'variance' of the transfer function
:type varianceY: float
:param lengthscaleY: 'lengthscale' of the transfer function
:type lengthscaleY: float
:rtype: kernel object
"""
part = parts.ODE_1.ODE_1(input_dim, varianceU, varianceY, lengthscaleU, lengthscaleY)
return kern(input_dim, [part])

View file

@ -1,6 +1,7 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
import sys
import numpy as np import numpy as np
import pylab as pb import pylab as pb
from ..core.parameterized import Parameterized from ..core.parameterized import Parameterized
@ -79,13 +80,15 @@ class kern(Parameterized):
def plot_ARD(self, fignum=None, ax=None, title='', legend=False): def plot_ARD(self, fignum=None, ax=None, title='', legend=False):
"""If an ARD kernel is present, it bar-plots the ARD parameters, """If an ARD kernel is present, it bar-plots the ARD parameters.
:param fignum: figure number of the plot :param fignum: figure number of the plot
:param ax: matplotlib axis to plot on :param ax: matplotlib axis to plot on
:param title: :param title:
title of the plot, title of the plot,
pass '' to not print a title pass '' to not print a title
pass None for a generic title pass None for a generic title
""" """
if ax is None: if ax is None:
fig = pb.figure(fignum) fig = pb.figure(fignum)
@ -176,8 +179,10 @@ class kern(Parameterized):
def add(self, other, tensor=False): def add(self, other, tensor=False):
""" """
Add another kernel to this one. Both kernels are defined on the same _space_ Add another kernel to this one. Both kernels are defined on the same _space_
:param other: the other kernel to be added :param other: the other kernel to be added
:type other: GPy.kern :type other: GPy.kern
""" """
if tensor: if tensor:
D = self.input_dim + other.input_dim D = self.input_dim + other.input_dim
@ -219,11 +224,13 @@ class kern(Parameterized):
def prod(self, other, tensor=False): def prod(self, other, tensor=False):
""" """
multiply two kernels (either on the same space, or on the tensor product of the input space). Multiply two kernels (either on the same space, or on the tensor product of the input space).
:param other: the other kernel to be added :param other: the other kernel to be added
:type other: GPy.kern :type other: GPy.kern
:param tensor: whether or not to use the tensor space (default is false). :param tensor: whether or not to use the tensor space (default is false).
:type tensor: bool :type tensor: bool
""" """
K1 = self.copy() K1 = self.copy()
K2 = other.copy() K2 = other.copy()
@ -322,6 +329,7 @@ class kern(Parameterized):
:type X: np.ndarray (num_samples x input_dim) :type X: np.ndarray (num_samples x input_dim)
:param X2: Observed data inputs (optional, defaults to X) :param X2: Observed data inputs (optional, defaults to X)
:type X2: np.ndarray (num_inducing x input_dim) :type X2: np.ndarray (num_inducing x input_dim)
""" """
assert X.shape[1] == self.input_dim assert X.shape[1] == self.input_dim
target = np.zeros(self.num_params) target = np.zeros(self.num_params)
@ -341,6 +349,7 @@ class kern(Parameterized):
:type X: np.ndarray (num_samples x input_dim) :type X: np.ndarray (num_samples x input_dim)
:param X2: Observed data inputs (optional, defaults to X) :param X2: Observed data inputs (optional, defaults to X)
:type X2: np.ndarray (num_inducing x input_dim)""" :type X2: np.ndarray (num_inducing x input_dim)"""
target = np.zeros_like(X) target = np.zeros_like(X)
if X2 is None: if X2 is None:
[p.dK_dX(dL_dK, X[:, i_s], None, target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)] [p.dK_dX(dL_dK, X[:, i_s], None, target[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
@ -414,6 +423,7 @@ class kern(Parameterized):
:param Z: np.ndarray of inducing inputs (num_inducing x input_dim) :param Z: np.ndarray of inducing inputs (num_inducing x input_dim)
:param mu, S: np.ndarrays of means and variances (each num_samples x input_dim) :param mu, S: np.ndarrays of means and variances (each num_samples x input_dim)
:returns psi2: np.ndarray (num_samples,num_inducing,num_inducing) :returns psi2: np.ndarray (num_samples,num_inducing,num_inducing)
""" """
target = np.zeros((mu.shape[0], Z.shape[0], Z.shape[0])) target = np.zeros((mu.shape[0], Z.shape[0], Z.shape[0]))
[p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)] [p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)]
@ -568,7 +578,7 @@ class Kern_check_model(Model):
def is_positive_definite(self): def is_positive_definite(self):
v = np.linalg.eig(self.kernel.K(self.X))[0] v = np.linalg.eig(self.kernel.K(self.X))[0]
if any(v<0): if any(v<-10*sys.float_info.epsilon):
return False return False
else: else:
return True return True
@ -657,6 +667,7 @@ def kern_test(kern, X=None, X2=None, verbose=False):
:type X: ndarray :type X: ndarray
:param X2: X2 input values to test the covariance function. :param X2: X2 input values to test the covariance function.
:type X2: ndarray :type X2: ndarray
""" """
pass_checks = True pass_checks = True
if X==None: if X==None:
@ -683,7 +694,7 @@ def kern_test(kern, X=None, X2=None, verbose=False):
Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=True) Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=True)
pass_checks = False pass_checks = False
return False return False
if verbose: if verbose:
print("Checking gradients of K(X, X2) wrt theta.") print("Checking gradients of K(X, X2) wrt theta.")
result = Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=verbose) result = Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=verbose)
@ -694,7 +705,7 @@ def kern_test(kern, X=None, X2=None, verbose=False):
Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=True) Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=True)
pass_checks = False pass_checks = False
return False return False
if verbose: if verbose:
print("Checking gradients of Kdiag(X) wrt theta.") print("Checking gradients of Kdiag(X) wrt theta.")
result = Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=verbose) result = Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=verbose)
@ -705,10 +716,15 @@ def kern_test(kern, X=None, X2=None, verbose=False):
Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=True) Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=True)
pass_checks = False pass_checks = False
return False return False
if verbose: if verbose:
print("Checking gradients of K(X, X) wrt X.") print("Checking gradients of K(X, X) wrt X.")
result = Kern_check_dK_dX(kern, X=X, X2=None).checkgrad(verbose=verbose) try:
result = Kern_check_dK_dX(kern, X=X, X2=None).checkgrad(verbose=verbose)
except NotImplementedError:
result=True
if verbose:
print("dK_dX not implemented for " + kern.name)
if result and verbose: if result and verbose:
print("Check passed.") print("Check passed.")
if not result: if not result:
@ -719,7 +735,12 @@ def kern_test(kern, X=None, X2=None, verbose=False):
if verbose: if verbose:
print("Checking gradients of K(X, X2) wrt X.") print("Checking gradients of K(X, X2) wrt X.")
result = Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=verbose) try:
result = Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=verbose)
except NotImplementedError:
result=True
if verbose:
print("dK_dX not implemented for " + kern.name)
if result and verbose: if result and verbose:
print("Check passed.") print("Check passed.")
if not result: if not result:
@ -730,7 +751,12 @@ def kern_test(kern, X=None, X2=None, verbose=False):
if verbose: if verbose:
print("Checking gradients of Kdiag(X) wrt X.") print("Checking gradients of Kdiag(X) wrt X.")
result = Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=verbose) try:
result = Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=verbose)
except NotImplementedError:
result=True
if verbose:
print("dK_dX not implemented for " + kern.name)
if result and verbose: if result and verbose:
print("Check passed.") print("Check passed.")
if not result: if not result:
@ -738,5 +764,5 @@ def kern_test(kern, X=None, X2=None, verbose=False):
Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=True) Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=True)
pass_checks = False pass_checks = False
return False return False
return pass_checks return pass_checks

161
GPy/kern/parts/ODE_1.py Normal file
View file

@ -0,0 +1,161 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
class ODE_1(Kernpart):
    """
    Kernel resulting from a first order ODE with OU driving GP.

    :param input_dim: the number of input dimensions, has to be equal to one
    :type input_dim: int
    :param varianceU: variance of the driving GP
    :type varianceU: float
    :param lengthscaleU: lengthscale of the driving GP (sqrt(3)/lengthscaleU)
    :type lengthscaleU: float
    :param varianceY: 'variance' of the transfer function
    :type varianceY: float
    :param lengthscaleY: 'lengthscale' of the transfer function (1/lengthscaleY)
    :type lengthscaleY: float
    :rtype: kernel object
    """
    def __init__(self, input_dim=1, varianceU=1., varianceY=1., lengthscaleU=None, lengthscaleY=None):
        # The closed-form covariance below is only derived for 1-d inputs.
        assert input_dim==1, "Only defined for input_dim = 1"
        self.input_dim = input_dim
        self.num_params = 4
        self.name = 'ODE_1'
        # Lengthscales default to 1 when not given; both must be scalars.
        if lengthscaleU is not None:
            lengthscaleU = np.asarray(lengthscaleU)
            assert lengthscaleU.size == 1, "lengthscaleU should be one dimensional"
        else:
            lengthscaleU = np.ones(1)
        if lengthscaleY is not None:
            lengthscaleY = np.asarray(lengthscaleY)
            assert lengthscaleY.size == 1, "lengthscaleY should be one dimensional"
        else:
            lengthscaleY = np.ones(1)
            #lengthscaleY = 0.5
        # Parameter order: (varianceU, varianceY, lengthscaleU, lengthscaleY).
        self._set_params(np.hstack((varianceU, varianceY, lengthscaleU,lengthscaleY)))
    def _get_params(self):
        """Return the value of the parameters as a flat vector."""
        return np.hstack((self.varianceU,self.varianceY, self.lengthscaleU,self.lengthscaleY))
    def _set_params(self, x):
        """Set the value of the parameters from the flat vector x."""
        assert x.size == self.num_params
        self.varianceU = x[0]
        self.varianceY = x[1]
        self.lengthscaleU = x[2]
        self.lengthscaleY = x[3]
    def _get_param_names(self):
        """Return parameter names (same order as _get_params)."""
        return ['varianceU','varianceY', 'lengthscaleU', 'lengthscaleY']
    def K(self, X, X2, target):
        """Compute the covariance matrix between X and X2 and add it to target in place."""
        if X2 is None: X2 = X
        # i1 = X[:,1]
        # i2 = X2[:,1]
        # X = X[:,0].reshape(-1,1)
        # X2 = X2[:,0].reshape(-1,1)
        dist = np.abs(X - X2.T)
        # Reparameterisation: ly is the ODE decay rate (1/lengthscaleY), lu the
        # OU inverse lengthscale (sqrt(3)/lengthscaleU).
        ly=1/self.lengthscaleY
        lu=np.sqrt(3)/self.lengthscaleU
        #ly=self.lengthscaleY
        #lu=self.lengthscaleU
        # Three additive terms of the closed-form covariance.
        # NOTE(review): k2 divides by (ly-lu)**2 — ly == lu is a singular
        # configuration; confirm it is excluded upstream.
        k1 = np.exp(-ly*dist)*(2*lu+ly)/(lu+ly)**2
        k2 = (np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2
        k3 = np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
        # In-place accumulation into target (Kernpart convention).
        np.add(self.varianceU*self.varianceY*(k1+k2+k3), target, target)
    def Kdiag(self, X, target):
        """Compute the diagonal of the covariance matrix associated to X (added to target in place)."""
        ly=1/self.lengthscaleY
        lu=np.sqrt(3)/self.lengthscaleU
        #ly=self.lengthscaleY
        #lu=self.lengthscaleU
        # The k1..k3 terms of K evaluated at dist == 0 (k2 cancels to zero there).
        k1 = (2*lu+ly)/(lu+ly)**2
        k2 = (ly-2*lu + 2*lu-ly ) / (ly-lu)**2
        k3 = 1/(lu+ly) + (lu)/(lu+ly)**2
        np.add(self.varianceU*self.varianceY*(k1+k2+k3), target, target)
    def dK_dtheta(self, dL_dK, X, X2, target):
        """Accumulate into target the derivative of the objective wrt the parameters.

        target layout matches _get_param_names:
        [varianceU, varianceY, lengthscaleU, lengthscaleY].
        """
        if X2 is None: X2 = X
        dist = np.abs(X - X2.T)
        ly=1/self.lengthscaleY
        lu=np.sqrt(3)/self.lengthscaleU
        #ly=self.lengthscaleY
        #lu=self.lengthscaleU
        # Derivatives of k1..k3 with respect to lu (theta1)...
        dk1theta1 = np.exp(-ly*dist)*2*(-lu)/(lu+ly)**3
        #c=np.sqrt(3)
        #t1=c/lu
        #t2=1/ly
        #dk1theta1=np.exp(-dist*ly)*t2*( (2*c*t2+2*t1)/(c*t2+t1)**2 -2*(2*c*t2*t1+t1**2)/(c*t2+t1)**3 )
        dk2theta1 = 1*(
            np.exp(-lu*dist)*dist*(-ly+2*lu-lu*ly*dist+dist*lu**2)*(ly-lu)**(-2) + np.exp(-lu*dist)*(-2+ly*dist-2*dist*lu)*(ly-lu)**(-2)
            +np.exp(-dist*lu)*(ly-2*lu+ly*lu*dist-dist*lu**2)*2*(ly-lu)**(-3)
            +np.exp(-dist*ly)*2*(ly-lu)**(-2)
            +np.exp(-dist*ly)*2*(2*lu-ly)*(ly-lu)**(-3)
            )
        dk3theta1 = np.exp(-dist*lu)*(lu+ly)**(-2)*((2*lu+ly+dist*lu**2+lu*ly*dist)*(-dist-2/(lu+ly))+2+2*lu*dist+ly*dist)
        dktheta1 = self.varianceU*self.varianceY*(dk1theta1+dk2theta1+dk3theta1)
        # ...and with respect to ly (theta2).
        dk1theta2 = np.exp(-ly*dist) * ((lu+ly)**(-2)) * ( (-dist)*(2*lu+ly) + 1 + (-2)*(2*lu+ly)/(lu+ly) )
        dk2theta2 = 1*(
            np.exp(-dist*lu)*(ly-lu)**(-2) * ( 1+lu*dist+(-2)*(ly-2*lu+lu*ly*dist-dist*lu**2)*(ly-lu)**(-1) )
            +np.exp(-dist*ly)*(ly-lu)**(-2) * ( (-dist)*(2*lu-ly) -1+(2*lu-ly)*(-2)*(ly-lu)**(-1) )
            )
        dk3theta2 = np.exp(-dist*lu) * (-3*lu-ly-dist*lu**2-lu*ly*dist)/(lu+ly)**3
        dktheta2 = self.varianceU*self.varianceY*(dk1theta2 + dk2theta2 +dk3theta2)
        # Gradient wrt the two variances is the kernel without the variance product.
        k1 = np.exp(-ly*dist)*(2*lu+ly)/(lu+ly)**2
        k2 = (np.exp(-lu*dist)*(ly-2*lu+lu*ly*dist-lu**2*dist) + np.exp(-ly*dist)*(2*lu-ly) ) / (ly-lu)**2
        k3 = np.exp(-lu*dist) * ( (1+lu*dist)/(lu+ly) + (lu)/(lu+ly)**2 )
        dkdvar = k1+k2+k3
        target[0] += np.sum(self.varianceY*dkdvar * dL_dK)
        target[1] += np.sum(self.varianceU*dkdvar * dL_dK)
        # Chain rule back from (lu, ly) to the stored lengthscales:
        # dlu/dlengthscaleU = -sqrt(3)/lengthscaleU**2, dly/dlengthscaleY = -1/lengthscaleY**2.
        target[2] += np.sum(dktheta1*(-np.sqrt(3)*self.lengthscaleU**(-2)) * dL_dK)
        target[3] += np.sum(dktheta2*(-self.lengthscaleY**(-2)) * dL_dK)
    # def dKdiag_dtheta(self, dL_dKdiag, X, target):
    #     """derivative of the diagonal of the covariance matrix with respect to the parameters."""
    #     # NB: derivative of diagonal elements wrt lengthscale is 0
    #     target[0] += np.sum(dL_dKdiag)
    # def dK_dX(self, dL_dK, X, X2, target):
    #     """derivative of the covariance matrix with respect to X."""
    #     if X2 is None: X2 = X
    #     dist = np.sqrt(np.sum(np.square((X[:, None, :] - X2[None, :, :]) / self.lengthscale), -1))[:, :, None]
    #     ddist_dX = (X[:, None, :] - X2[None, :, :]) / self.lengthscale ** 2 / np.where(dist != 0., dist, np.inf)
    #     dK_dX = -np.transpose(self.variance * np.exp(-dist) * ddist_dX, (1, 0, 2))
    #     target += np.sum(dK_dX * dL_dK.T[:, :, None], 0)
    # def dKdiag_dX(self, dL_dKdiag, X, target):
    #     pass

View file

@ -2,16 +2,18 @@ import bias
import Brownian import Brownian
import coregionalize import coregionalize
import exponential import exponential
import eq_ode1
import finite_dimensional import finite_dimensional
import fixed import fixed
import gibbs import gibbs
#import hetero #hetero.py is not commited: omitting for now. JH. import hetero
import hierarchical import hierarchical
import independent_outputs import independent_outputs
import linear import linear
import Matern32 import Matern32
import Matern52 import Matern52
import mlp import mlp
import ODE_1
import periodic_exponential import periodic_exponential
import periodic_Matern32 import periodic_Matern32
import periodic_Matern52 import periodic_Matern52

View file

@ -11,44 +11,47 @@ class Coregionalize(Kernpart):
""" """
Covariance function for intrinsic/linear coregionalization models Covariance function for intrinsic/linear coregionalization models
This covariance has the form This covariance has the form:
.. math:: .. math::
\mathbf{B} = \mathbf{W}\mathbf{W}^\top + kappa \mathbf{I} \mathbf{B} = \mathbf{W}\mathbf{W}^\top + \text{diag}(kappa)
An intrinsic/linear coregionalization covariance function of the form An intrinsic/linear coregionalization covariance function of the form:
.. math:: .. math::
k_2(x, y)=\mathbf{B} k(x, y) k_2(x, y)=\mathbf{B} k(x, y)
it is obtained as the tensor product between a covariance function it is obtained as the tensor product between a covariance function
k(x,y) and B. k(x,y) and B.
:param num_outputs: number of outputs to coregionalize :param output_dim: number of outputs to coregionalize
:type num_outputs: int :type output_dim: int
:param W_columns: number of columns of the W matrix (this parameter is ignored if parameter W is not None) :param rank: number of columns of the W matrix (this parameter is ignored if parameter W is not None)
:type W_colunns: int :type rank: int
:param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalization matrix B :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalization matrix B
:type W: numpy array of dimensionality (num_outpus, W_columns) :type W: numpy array of dimensionality (num_outpus, W_columns)
:param kappa: a vector which allows the outputs to behave independently :param kappa: a vector which allows the outputs to behave independently
:type kappa: numpy array of dimensionality (num_outputs,) :type kappa: numpy array of dimensionality (output_dim,)
.. Note: see coregionalization examples in GPy.examples.regression for some usage. .. note: see coregionalization examples in GPy.examples.regression for some usage.
""" """
def __init__(self,num_outputs,W_columns=1, W=None, kappa=None): def __init__(self, output_dim, rank=1, W=None, kappa=None):
self.input_dim = 1 self.input_dim = 1
self.name = 'coregion' self.name = 'coregion'
self.num_outputs = num_outputs self.output_dim = output_dim
self.W_columns = W_columns self.rank = rank
if self.rank>output_dim-1:
print("Warning: Unusual choice of rank, it should normally be less than the output_dim.")
if W is None: if W is None:
self.W = 0.5*np.random.randn(self.num_outputs,self.W_columns)/np.sqrt(self.W_columns) self.W = 0.5*np.random.randn(self.output_dim,self.rank)/np.sqrt(self.rank)
else: else:
assert W.shape==(self.num_outputs,self.W_columns) assert W.shape==(self.output_dim,self.rank)
self.W = W self.W = W
if kappa is None: if kappa is None:
kappa = 0.5*np.ones(self.num_outputs) kappa = 0.5*np.ones(self.output_dim)
else: else:
assert kappa.shape==(self.num_outputs,) assert kappa.shape==(self.output_dim,)
self.kappa = kappa self.kappa = kappa
self.num_params = self.num_outputs*(self.W_columns + 1) self.num_params = self.output_dim*(self.rank + 1)
self._set_params(np.hstack([self.W.flatten(),self.kappa])) self._set_params(np.hstack([self.W.flatten(),self.kappa]))
def _get_params(self): def _get_params(self):
@ -56,12 +59,12 @@ class Coregionalize(Kernpart):
def _set_params(self,x): def _set_params(self,x):
assert x.size == self.num_params assert x.size == self.num_params
self.kappa = x[-self.num_outputs:] self.kappa = x[-self.output_dim:]
self.W = x[:-self.num_outputs].reshape(self.num_outputs,self.W_columns) self.W = x[:-self.output_dim].reshape(self.output_dim,self.rank)
self.B = np.dot(self.W,self.W.T) + np.diag(self.kappa) self.B = np.dot(self.W,self.W.T) + np.diag(self.kappa)
def _get_param_names(self): def _get_param_names(self):
return sum([['W%i_%i'%(i,j) for j in range(self.W_columns)] for i in range(self.num_outputs)],[]) + ['kappa_%i'%i for i in range(self.num_outputs)] return sum([['W%i_%i'%(i,j) for j in range(self.rank)] for i in range(self.output_dim)],[]) + ['kappa_%i'%i for i in range(self.output_dim)]
def K(self,index,index2,target): def K(self,index,index2,target):
index = np.asarray(index,dtype=np.int) index = np.asarray(index,dtype=np.int)
@ -79,26 +82,26 @@ class Coregionalize(Kernpart):
if index2 is None: if index2 is None:
code=""" code="""
for(int i=0;i<N; i++){ for(int i=0;i<N; i++){
target[i+i*N] += B[index[i]+num_outputs*index[i]]; target[i+i*N] += B[index[i]+output_dim*index[i]];
for(int j=0; j<i; j++){ for(int j=0; j<i; j++){
target[j+i*N] += B[index[i]+num_outputs*index[j]]; target[j+i*N] += B[index[i]+output_dim*index[j]];
target[i+j*N] += target[j+i*N]; target[i+j*N] += target[j+i*N];
} }
} }
""" """
N,B,num_outputs = index.size, self.B, self.num_outputs N,B,output_dim = index.size, self.B, self.output_dim
weave.inline(code,['target','index','N','B','num_outputs']) weave.inline(code,['target','index','N','B','output_dim'])
else: else:
index2 = np.asarray(index2,dtype=np.int) index2 = np.asarray(index2,dtype=np.int)
code=""" code="""
for(int i=0;i<num_inducing; i++){ for(int i=0;i<num_inducing; i++){
for(int j=0; j<N; j++){ for(int j=0; j<N; j++){
target[i+j*num_inducing] += B[num_outputs*index[j]+index2[i]]; target[i+j*num_inducing] += B[output_dim*index[j]+index2[i]];
} }
} }
""" """
N,num_inducing,B,num_outputs = index.size,index2.size, self.B, self.num_outputs N,num_inducing,B,output_dim = index.size,index2.size, self.B, self.output_dim
weave.inline(code,['target','index','index2','N','num_inducing','B','num_outputs']) weave.inline(code,['target','index','index2','N','num_inducing','B','output_dim'])
def Kdiag(self,index,target): def Kdiag(self,index,target):
@ -115,12 +118,12 @@ class Coregionalize(Kernpart):
code=""" code="""
for(int i=0; i<num_inducing; i++){ for(int i=0; i<num_inducing; i++){
for(int j=0; j<N; j++){ for(int j=0; j<N; j++){
dL_dK_small[index[j] + num_outputs*index2[i]] += dL_dK[i+j*num_inducing]; dL_dK_small[index[j] + output_dim*index2[i]] += dL_dK[i+j*num_inducing];
} }
} }
""" """
N, num_inducing, num_outputs = index.size, index2.size, self.num_outputs N, num_inducing, output_dim = index.size, index2.size, self.output_dim
weave.inline(code, ['N','num_inducing','num_outputs','dL_dK','dL_dK_small','index','index2']) weave.inline(code, ['N','num_inducing','output_dim','dL_dK','dL_dK_small','index','index2'])
dkappa = np.diag(dL_dK_small) dkappa = np.diag(dL_dK_small)
dL_dK_small += dL_dK_small.T dL_dK_small += dL_dK_small.T
@ -137,8 +140,8 @@ class Coregionalize(Kernpart):
ii,jj = ii.T, jj.T ii,jj = ii.T, jj.T
dL_dK_small = np.zeros_like(self.B) dL_dK_small = np.zeros_like(self.B)
for i in range(self.num_outputs): for i in range(self.output_dim):
for j in range(self.num_outputs): for j in range(self.output_dim):
tmp = np.sum(dL_dK[(ii==i)*(jj==j)]) tmp = np.sum(dL_dK[(ii==i)*(jj==j)])
dL_dK_small[i,j] = tmp dL_dK_small[i,j] = tmp
@ -150,8 +153,8 @@ class Coregionalize(Kernpart):
def dKdiag_dtheta(self,dL_dKdiag,index,target): def dKdiag_dtheta(self,dL_dKdiag,index,target):
index = np.asarray(index,dtype=np.int).flatten() index = np.asarray(index,dtype=np.int).flatten()
dL_dKdiag_small = np.zeros(self.num_outputs) dL_dKdiag_small = np.zeros(self.output_dim)
for i in range(self.num_outputs): for i in range(self.output_dim):
dL_dKdiag_small[i] += np.sum(dL_dKdiag[index==i]) dL_dKdiag_small[i] += np.sum(dL_dKdiag[index==i])
dW = 2.*self.W*dL_dKdiag_small[:,None] dW = 2.*self.W*dL_dKdiag_small[:,None]
dkappa = dL_dKdiag_small dkappa = dL_dKdiag_small

556
GPy/kern/parts/eq_ode1.py Normal file
View file

@ -0,0 +1,556 @@
# Copyright (c) 2013, GPy Authors, see AUTHORS.txt
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernpart import Kernpart
import numpy as np
from GPy.util.linalg import mdot, pdinv
from GPy.util.ln_diff_erfs import ln_diff_erfs
import pdb
from scipy import weave
class Eq_ode1(Kernpart):
"""
Covariance function for first order differential equation driven by an exponentiated quadratic covariance.
This outputs of this kernel have the form
.. math::
\frac{\text{d}y_j}{\text{d}t} = \sum_{i=1}^R w_{j,i} f_i(t-\delta_j) +\sqrt{\kappa_j}g_j(t) - d_jy_j(t)
where :math:`R` is the rank of the system, :math:`w_{j,i}` is the sensitivity of the :math:`j`th output to the :math:`i`th latent function, :math:`d_j` is the decay rate of the :math:`j`th output and :math:`f_i(t)` and :math:`g_i(t)` are independent latent Gaussian processes goverened by an exponentiated quadratic covariance.
:param output_dim: number of outputs driven by latent function.
:type output_dim: int
:param W: sensitivities of each output to the latent driving function.
:type W: ndarray (output_dim x rank).
:param rank: If rank is greater than 1 then there are assumed to be a total of rank latent forces independently driving the system, each with identical covariance.
:type rank: int
:param decay: decay rates for the first order system.
:type decay: array of length output_dim.
:param delay: delay between latent force and output response.
:type delay: array of length output_dim.
:param kappa: diagonal term that allows each latent output to have an independent component to the response.
:type kappa: array of length output_dim.
.. Note: see first order differential equation examples in GPy.examples.regression for some usage.
"""
def __init__(self,output_dim, W=None, rank=1, kappa=None, lengthscale=1.0, decay=None, delay=None):
self.rank = rank
self.input_dim = 1
self.name = 'eq_ode1'
self.output_dim = output_dim
self.lengthscale = lengthscale
self.num_params = self.output_dim*self.rank + 1 + (self.output_dim - 1)
if kappa is not None:
self.num_params+=self.output_dim
if delay is not None:
assert delay.shape==(self.output_dim-1,)
self.num_params+=self.output_dim-1
self.rank = rank
if W is None:
self.W = 0.5*np.random.randn(self.output_dim,self.rank)/np.sqrt(self.rank)
else:
assert W.shape==(self.output_dim,self.rank)
self.W = W
if decay is None:
self.decay = np.ones(self.output_dim-1)
if kappa is not None:
assert kappa.shape==(self.output_dim,)
self.kappa = kappa
self.delay = delay
self.is_normalized = True
self.is_stationary = False
self.gaussian_initial = False
self._set_params(self._get_params())
def _get_params(self):
param_list = [self.W.flatten()]
if self.kappa is not None:
param_list.append(self.kappa)
param_list.append(self.decay)
if self.delay is not None:
param_list.append(self.delay)
param_list.append(self.lengthscale)
return np.hstack(param_list)
    def _set_params(self,x):
        """Unpack the flat parameter vector x into W, (kappa), decay, (delay), lengthscale.

        The slice layout must mirror _get_params: W first, then kappa if
        present, then the decays, then the delays if present, then the
        lengthscale.  B and sigma are rebuilt as derived quantities.
        """
        assert x.size == self.num_params
        end = self.output_dim*self.rank
        self.W = x[:end].reshape(self.output_dim,self.rank)
        start = end
        self.B = np.dot(self.W,self.W.T)
        # NOTE(review): presence of the kappa/delay slices is keyed on the
        # *previous* value of self.kappa/self.delay, so their None-ness must
        # never change after construction — confirm this invariant holds.
        if self.kappa is not None:
            end+=self.output_dim
            self.kappa = x[start:end]
            # kappa adds an independent (diagonal) component per output.
            self.B += np.diag(self.kappa)
            start=end
        end+=self.output_dim-1
        self.decay = x[start:end]
        start=end
        if self.delay is not None:
            end+=self.output_dim-1
            self.delay = x[start:end]
            start=end
        end+=1
        self.lengthscale = x[start]
        # sigma is the sqrt(2)-scaled lengthscale used throughout the H computations.
        self.sigma = np.sqrt(2)*self.lengthscale
def _get_param_names(self):
param_names = sum([['W%i_%i'%(i,j) for j in range(self.rank)] for i in range(self.output_dim)],[])
if self.kappa is not None:
param_names += ['kappa_%i'%i for i in range(self.output_dim)]
param_names += ['decay_%i'%i for i in range(1,self.output_dim)]
if self.delay is not None:
param_names += ['delay_%i'%i for i in 1+range(1,self.output_dim)]
param_names+= ['lengthscale']
return param_names
    def K(self,X,X2,target):
        """Accumulate the covariance between X and X2 into target (in place).

        X carries times in column 0 and integer output indices in column 1
        (index 0 is the latent EQ process, indices > 0 are ODE outputs).
        """
        if X.shape[1] > 2:
            raise ValueError('Input matrix for ode1 covariance should have at most two columns, one containing times, the other output indices')
        self._K_computations(X, X2)
        # _scale holds the coregionalisation weights B for each entry.
        target += self._scale*self._K_dvar
        if self.gaussian_initial:
            # Add covariance associated with initial condition.
            # NOTE(review): when X2 is None self._t2 is None, so this branch
            # would fail; gaussian_initial is always False in __init__ —
            # confirm before enabling. initial_variance is also not set
            # anywhere visible here.
            t1_mat = self._t[self._rorder, None]
            t2_mat = self._t2[None, self._rorder2]
            target+=self.initial_variance * np.exp(- self.decay * (t1_mat + t2_mat))
    def Kdiag(self,index,target):
        """Diagonal of the covariance: currently a no-op (not yet implemented)."""
        #target += np.diag(self.B)[np.asarray(index,dtype=np.int).flatten()]
        pass
    def dK_dtheta(self,dL_dK,X,X2,target):
        """Accumulate the gradient of the objective wrt the kernel parameters into target."""
        # First extract times and indices.
        self._extract_t_indices(X, X2, dL_dK=dL_dK)
        self._dK_ode_dtheta(target)
    def _dK_ode_dtheta(self, target):
        """Do all the computations for the ode parts of the covariance function.

        NOTE(review): this gradient is only partially implemented — only the
        lengthscale entry (target[-1]) is written; the remaining gradients are
        still in the commented-out MATLAB port below.
        """
        # Restrict to the ODE outputs (index > 0); index_ode re-bases the
        # output indices to 0 so they address decay/delay arrays.
        t_ode = self._t[self._index>0]
        dL_dK_ode = self._dL_dK[self._index>0, :]
        index_ode = self._index[self._index>0]-1
        if self._t2 is None:
            if t_ode.size==0:
                return
            t2_ode = t_ode
            dL_dK_ode = dL_dK_ode[:, self._index>0]
            index2_ode = index_ode
        else:
            t2_ode = self._t2[self._index2>0]
            dL_dK_ode = dL_dK_ode[:, self._index2>0]
            if t_ode.size==0 or t2_ode.size==0:
                return
            index2_ode = self._index2[self._index2>0]-1
        # _compute_H updates self._dh_* derivative caches as a side effect.
        h1 = self._compute_H(t_ode, index_ode, t2_ode, index2_ode, stationary=self.is_stationary, update_derivatives=True)
        #self._dK_ddelay = self._dh_ddelay
        self._dK_dsigma = self._dh_dsigma
        if self._t2 is None:
            h2 = h1
        else:
            h2 = self._compute_H(t2_ode, index2_ode, t_ode, index_ode, stationary=self.is_stationary, update_derivatives=True)
        #self._dK_ddelay += self._dh_ddelay.T
        self._dK_dsigma += self._dh_dsigma.T
        # C1 = self.sensitivity
        # C2 = self.sensitivity
        # K = 0.5 * (h1 + h2.T)
        # var2 = C1*C2
        # if self.is_normalized:
        #     dk_dD1 = (sum(sum(dL_dK.*dh1_dD1)) + sum(sum(dL_dK.*dh2_dD1.T)))*0.5*var2
        #     dk_dD2 = (sum(sum(dL_dK.*dh1_dD2)) + sum(sum(dL_dK.*dh2_dD2.T)))*0.5*var2
        #     dk_dsigma = 0.5 * var2 * sum(sum(dL_dK.*dK_dsigma))
        #     dk_dC1 = C2 * sum(sum(dL_dK.*K))
        #     dk_dC2 = C1 * sum(sum(dL_dK.*K))
        # else:
        #     K = np.sqrt(np.pi) * K
        #     dk_dD1 = (sum(sum(dL_dK.*dh1_dD1)) + * sum(sum(dL_dK.*K))
        #     dk_dC2 = self.sigma * C1 * sum(sum(dL_dK.*K))
        # dk_dSim1Variance = dk_dC1
        # Last element is the length scale.
        # NOTE(review): the next expression's result is discarded (dead code) —
        # presumably a half-finished delay gradient; confirm intent.
        (dL_dK_ode[:, :, None]*self._dh_ddelay[:, None, :]).sum(2)
        # sigma = sqrt(2)*lengthscale, hence the 1/sqrt(2) chain-rule factor.
        target[-1] += (dL_dK_ode*self._dK_dsigma/np.sqrt(2)).sum()
        # # only pass the gradient with respect to the inverse width to one
        # # of the gradient vectors ... otherwise it is counted twice.
        # g1 = real([dk_dD1 dk_dinvWidth dk_dSim1Variance])
        # g2 = real([dk_dD2 0 dk_dSim2Variance])
        # return g1, g2"""
    def dKdiag_dtheta(self,dL_dKdiag,index,target):
        """Gradient of the diagonal wrt parameters: not implemented yet (no-op)."""
        pass
    def dK_dX(self,dL_dK,X,X2,target):
        """Gradient of the covariance wrt the inputs: not implemented yet (no-op)."""
        pass
    def _extract_t_indices(self, X, X2=None, dL_dK=None):
        """Extract times and output indices from the input matrix X. Times are ordered according to their index for convenience of computation, this ordering is stored in self._order and self.order2. These orderings are then mapped back to the original ordering (in X) using self._rorder and self._rorder2. """
        # TODO: some fast checking here to see if this needs recomputing?
        self._t = X[:, 0]
        if not X.shape[1] == 2:
            raise ValueError('Input matrix for ode1 covariance should have two columns, one containing times, the other output indices')
        # NOTE(review): np.int is the builtin int alias (removed in NumPy>=1.24).
        self._index = np.asarray(X[:, 1],dtype=np.int)
        # Sort indices so that outputs are in blocks for computational
        # convenience.
        self._order = self._index.argsort()
        self._index = self._index[self._order]
        self._t = self._t[self._order]
        self._rorder = self._order.argsort() # rorder is for reversing the order
        if X2 is None:
            # Symmetric case: reuse the X ordering for the second argument.
            self._t2 = None
            self._index2 = None
            self._order2 = self._order
            self._rorder2 = self._rorder
        else:
            if not X2.shape[1] == 2:
                raise ValueError('Input matrix for ode1 covariance should have two columns, one containing times, the other output indices')
            self._t2 = X2[:, 0]
            self._index2 = np.asarray(X2[:, 1],dtype=np.int)
            self._order2 = self._index2.argsort()
            self._index2 = self._index2[self._order2]
            self._t2 = self._t2[self._order2]
            self._rorder2 = self._order2.argsort() # rorder2 is for reversing order
        if dL_dK is not None:
            # Permute the incoming gradient into the same block ordering.
            self._dL_dK = dL_dK[self._order, :]
            self._dL_dK = self._dL_dK[:, self._order2]
    def _K_computations(self, X, X2):
        """Perform main body of computations for the ode1 covariance function.

        Builds the full covariance in blocks — latent/latent (_K_eq),
        latent/ODE cross terms (_K_eq_ode, _K_ode_eq) and ODE/ODE (_K_ode) —
        then maps the block ordering back to the input ordering and computes
        the coregionalisation scaling matrix _scale from B.
        """
        # First extract times and indices.
        self._extract_t_indices(X, X2)
        self._K_compute_eq()
        self._K_compute_ode_eq()
        if X2 is None:
            # Symmetric case: the eq/ode block is the transpose of ode/eq.
            self._K_eq_ode = self._K_ode_eq.T
        else:
            self._K_compute_ode_eq(transpose=True)
        self._K_compute_ode()
        # NOTE(review): these zero matrices are immediately overwritten by the
        # vstack below — presumably dead code; confirm before removing.
        if X2 is None:
            self._K_dvar = np.zeros((self._t.shape[0], self._t.shape[0]))
        else:
            self._K_dvar = np.zeros((self._t.shape[0], self._t2.shape[0]))
        # Reorder values of blocks for placing back into _K_dvar.
        self._K_dvar = np.vstack((np.hstack((self._K_eq, self._K_eq_ode)),
                                  np.hstack((self._K_ode_eq, self._K_ode))))
        self._K_dvar = self._K_dvar[self._rorder, :]
        self._K_dvar = self._K_dvar[:, self._rorder2]
        if X2 is None:
            # Matrix giving scales of each output
            # (symmetric fill of B[index_i, index_j] via inline C for speed).
            self._scale = np.zeros((self._t.size, self._t.size))
            code="""
            for(int i=0;i<N; i++){
              scale_mat[i+i*N] = B[index[i]+output_dim*(index[i])];
              for(int j=0; j<i; j++){
                  scale_mat[j+i*N] = B[index[i]+output_dim*index[j]];
                  scale_mat[i+j*N] = scale_mat[j+i*N];
                }
              }
            """
            scale_mat, B, index = self._scale, self.B, self._index
            N, output_dim = self._t.size, self.output_dim
            weave.inline(code,['index',
                               'scale_mat', 'B',
                               'N', 'output_dim'])
        else:
            self._scale = np.zeros((self._t.size, self._t2.size))
            code = """
            for(int i=0; i<N; i++){
              for(int j=0; j<N2; j++){
                scale_mat[i+j*N] = B[index[i]+output_dim*index2[j]];
              }
            }
            """
            scale_mat, B, index, index2 = self._scale, self.B, self._index, self._index2
            N, N2, output_dim = self._t.size, self._t2.size, self.output_dim
            weave.inline(code, ['index', 'index2',
                                'scale_mat', 'B',
                                'N', 'N2', 'output_dim'])
def _K_compute_eq(self):
"""Compute covariance for latent covariance."""
t_eq = self._t[self._index==0]
if self._t2 is None:
if t_eq.size==0:
self._K_eq = np.zeros((0, 0))
return
self._dist2 = np.square(t_eq[:, None] - t_eq[None, :])
else:
t2_eq = self._t2[self._index2==0]
if t_eq.size==0 or t2_eq.size==0:
self._K_eq = np.zeros((t_eq.size, t2_eq.size))
return
self._dist2 = np.square(t_eq[:, None] - t2_eq[None, :])
self._K_eq = np.exp(-self._dist2/(2*self.lengthscale*self.lengthscale))
if self.is_normalized:
self._K_eq/=(np.sqrt(2*np.pi)*self.lengthscale)
def _K_compute_ode_eq(self, transpose=False):
"""Compute the cross covariances between latent exponentiated quadratic and observed ordinary differential equations.
:param transpose: if set to false the exponentiated quadratic is on the rows of the matrix and is computed according to self._t, if set to true it is on the columns and is computed according to self._t2 (default=False).
:type transpose: bool"""
if self._t2 is not None:
if transpose:
t_eq = self._t[self._index==0]
t_ode = self._t2[self._index2>0]
index_ode = self._index2[self._index2>0]-1
else:
t_eq = self._t2[self._index2==0]
t_ode = self._t[self._index>0]
index_ode = self._index[self._index>0]-1
else:
t_eq = self._t[self._index==0]
t_ode = self._t[self._index>0]
index_ode = self._index[self._index>0]-1
if t_ode.size==0 or t_eq.size==0:
if transpose:
self._K_eq_ode = np.zeros((t_eq.shape[0], t_ode.shape[0]))
else:
self._K_ode_eq = np.zeros((t_ode.shape[0], t_eq.shape[0]))
return
t_ode_mat = t_ode[:, None]
t_eq_mat = t_eq[None, :]
if self.delay is not None:
t_ode_mat -= self.delay[index_ode, None]
diff_t = (t_ode_mat - t_eq_mat)
inv_sigma_diff_t = 1./self.sigma*diff_t
decay_vals = self.decay[index_ode][:, None]
half_sigma_d_i = 0.5*self.sigma*decay_vals
if self.is_stationary:
ln_part, signs = ln_diff_erfs(inf, half_sigma_d_i - inv_sigma_diff_t, return_sign=True)
else:
ln_part, signs = ln_diff_erfs(half_sigma_d_i + t_eq_mat/self.sigma, half_sigma_d_i - inv_sigma_diff_t, return_sign=True)
sK = signs*np.exp(half_sigma_d_i*half_sigma_d_i - decay_vals*diff_t + ln_part)
sK *= 0.5
if not self.is_normalized:
sK *= np.sqrt(np.pi)*self.sigma
if transpose:
self._K_eq_ode = sK.T
else:
self._K_ode_eq = sK
def _K_compute_ode(self):
# Compute covariances between outputs of the ODE models.
t_ode = self._t[self._index>0]
index_ode = self._index[self._index>0]-1
if self._t2 is None:
if t_ode.size==0:
self._K_ode = np.zeros((0, 0))
return
t2_ode = t_ode
index2_ode = index_ode
else:
t2_ode = self._t2[self._index2>0]
if t_ode.size==0 or t2_ode.size==0:
self._K_ode = np.zeros((t_ode.size, t2_ode.size))
return
index2_ode = self._index2[self._index2>0]-1
# When index is identical
h = self._compute_H(t_ode, index_ode, t2_ode, index2_ode, stationary=self.is_stationary)
if self._t2 is None:
self._K_ode = 0.5 * (h + h.T)
else:
h2 = self._compute_H(t2_ode, index2_ode, t_ode, index_ode, stationary=self.is_stationary)
self._K_ode = 0.5 * (h + h2.T)
if not self.is_normalized:
self._K_ode *= np.sqrt(np.pi)*self.sigma
    def _compute_diag_H(self, t, index, update_derivatives=False, stationary=False):
        """Helper function for computing H for the diagonal only.

        NOTE(review): this method is currently unimplemented -- the body
        below is a dead string literal kept as a reference sketch (it looks
        like a part-finished port of a MATLAB simComputeH routine; it
        references undefined names such as D_j and delta_i) and the method
        simply returns None via ``pass``.

        :param t: time input.
        :type t: array
        :param index: output indices.
        :type index: array of int.
        :param update_derivatives: whether or not to update the derivative portions (default False).
        :type update_derivatives: bool
        :param stationary: whether to compute the stationary version of the covariance (default False).
        :type stationary: bool"""
        # The following string literal is never executed; it is retained
        # verbatim as a sketch for a future implementation.
        """if delta_i~=delta_j:
            [h, dh_dD_i, dh_dD_j, dh_dsigma] = np.diag(simComputeH(t, index, t, index, update_derivatives=True, stationary=self.is_stationary))
        else:
            Decay = self.decay[index]
            if self.delay is not None:
                t = t - self.delay[index]
            t_squared = t*t
            half_sigma_decay = 0.5*self.sigma*Decay
            [ln_part_1, sign1] = ln_diff_erfs(half_sigma_decay + t/self.sigma,
                                              half_sigma_decay)
            [ln_part_2, sign2] = ln_diff_erfs(half_sigma_decay,
                                              half_sigma_decay - t/self.sigma)
            h = (sign1*np.exp(half_sigma_decay*half_sigma_decay
                              + ln_part_1
                              - log(Decay + D_j))
                 - sign2*np.exp(half_sigma_decay*half_sigma_decay
                                - (Decay + D_j)*t
                                + ln_part_2
                                - log(Decay + D_j)))
            sigma2 = self.sigma*self.sigma
            if update_derivatives:
                dh_dD_i = ((0.5*Decay*sigma2*(Decay + D_j)-1)*h
                           + t*sign2*np.exp(
                half_sigma_decay*half_sigma_decay-(Decay+D_j)*t + ln_part_2
                )
                           + self.sigma/np.sqrt(np.pi)*
                           (-1 + np.exp(-t_squared/sigma2-Decay*t)
                            + np.exp(-t_squared/sigma2-D_j*t)
                            - np.exp(-(Decay + D_j)*t)))
                dh_dD_i = (dh_dD_i/(Decay+D_j)).real
                dh_dD_j = (t*sign2*np.exp(
                half_sigma_decay*half_sigma_decay-(Decay + D_j)*t+ln_part_2
                )
                           -h)
                dh_dD_j = (dh_dD_j/(Decay + D_j)).real
                dh_dsigma = 0.5*Decay*Decay*self.sigma*h \
                + 2/(np.sqrt(np.pi)*(Decay+D_j))\
                *((-Decay/2) \
                + (-t/sigma2+Decay/2)*np.exp(-t_squared/sigma2 - Decay*t) \
                - (-t/sigma2-Decay/2)*np.exp(-t_squared/sigma2 - D_j*t) \
                - Decay/2*np.exp(-(Decay+D_j)*t))"""
        pass
def _compute_H(self, t, index, t2, index2, update_derivatives=False, stationary=False):
"""Helper function for computing part of the ode1 covariance function.
:param t: first time input.
:type t: array
:param index: Indices of first output.
:type index: array of int
:param t2: second time input.
:type t2: array
:param index2: Indices of second output.
:type index2: array of int
:param update_derivatives: whether to update derivatives (default is False)
:return h : result of this subcomponent of the kernel for the given values.
:rtype: ndarray
"""
if stationary:
raise NotImplementedError, "Error, stationary version of this covariance not yet implemented."
# Vector of decays and delays associated with each output.
Decay = self.decay[index]
Decay2 = self.decay[index2]
t_mat = t[:, None]
t2_mat = t2[None, :]
if self.delay is not None:
Delay = self.delay[index]
Delay2 = self.delay[index2]
t_mat-=Delay[:, None]
t2_mat-=Delay2[None, :]
diff_t = (t_mat - t2_mat)
inv_sigma_diff_t = 1./self.sigma*diff_t
half_sigma_decay_i = 0.5*self.sigma*Decay[:, None]
ln_part_1, sign1 = ln_diff_erfs(half_sigma_decay_i + t2_mat/self.sigma,
half_sigma_decay_i - inv_sigma_diff_t,
return_sign=True)
ln_part_2, sign2 = ln_diff_erfs(half_sigma_decay_i,
half_sigma_decay_i - t_mat/self.sigma,
return_sign=True)
h = sign1*np.exp(half_sigma_decay_i
*half_sigma_decay_i
-Decay[:, None]*diff_t+ln_part_1
-np.log(Decay[:, None] + Decay2[None, :]))
h -= sign2*np.exp(half_sigma_decay_i*half_sigma_decay_i
-Decay[:, None]*t_mat-Decay2[None, :]*t2_mat+ln_part_2
-np.log(Decay[:, None] + Decay2[None, :]))
if update_derivatives:
sigma2 = self.sigma*self.sigma
# Update ith decay gradient
dh_ddecay = ((0.5*Decay[:, None]*sigma2*(Decay[:, None] + Decay2[None, :])-1)*h
+ (-diff_t*sign1*np.exp(
half_sigma_decay_i*half_sigma_decay_i-Decay[:, None]*diff_t+ln_part_1
)
+t_mat*sign2*np.exp(
half_sigma_decay_i*half_sigma_decay_i-Decay[:, None]*t_mat
- Decay2*t2_mat+ln_part_2))
+self.sigma/np.sqrt(np.pi)*(
-np.exp(
-diff_t*diff_t/sigma2
)+np.exp(
-t2_mat*t2_mat/sigma2-Decay[:, None]*t_mat
)+np.exp(
-t_mat*t_mat/sigma2-Decay2[None, :]*t2_mat
)-np.exp(
-(Decay[:, None]*t_mat + Decay2[None, :]*t2_mat)
)
))
self._dh_ddecay = (dh_ddecay/(Decay[:, None]+Decay2[None, :])).real
# Update jth decay gradient
dh_ddecay2 = (t2_mat*sign2
*np.exp(
half_sigma_decay_i*half_sigma_decay_i
-(Decay[:, None]*t_mat + Decay2[None, :]*t2_mat)
+ln_part_2
)
-h)
self._dh_ddecay2 = (dh_ddecay/(Decay[:, None] + Decay2[None, :])).real
# Update sigma gradient
self._dh_dsigma = (half_sigma_decay_i*Decay[:, None]*h
+ 2/(np.sqrt(np.pi)
*(Decay[:, None]+Decay2[None, :]))
*((-diff_t/sigma2-Decay[:, None]/2)
*np.exp(-diff_t*diff_t/sigma2)
+ (-t2_mat/sigma2+Decay[:, None]/2)
*np.exp(-t2_mat*t2_mat/sigma2-Decay[:, None]*t_mat)
- (-t_mat/sigma2-Decay[:, None]/2)
*np.exp(-t_mat*t_mat/sigma2-Decay2[None, :]*t2_mat)
- Decay[:, None]/2
*np.exp(-(Decay[:, None]*t_mat+Decay2[None, :]*t2_mat))))
return h

View file

@ -10,9 +10,12 @@ import GPy
class Hetero(Kernpart): class Hetero(Kernpart):
""" """
TODO: Need to constrain the function outputs positive (still thinking of best way of doing this!!! Yes, intend to use transformations, but what's the *best* way). Currently just squaring output. TODO: Need to constrain the function outputs
positive (still thinking of best way of doing this!!! Yes, intend to use
transformations, but what's the *best* way). Currently just squaring output.
Heteroschedastic noise which depends on input location. See, for example, this paper by Goldberg et al. Heteroschedastic noise which depends on input location. See, for example,
this paper by Goldberg et al.
.. math:: .. math::
@ -20,15 +23,15 @@ class Hetero(Kernpart):
where :math:`\sigma^2(x)` is a function giving the variance as a function of input space and :math:`\delta_{i,j}` is the Kronecker delta function. where :math:`\sigma^2(x)` is a function giving the variance as a function of input space and :math:`\delta_{i,j}` is the Kronecker delta function.
The parameters are the parameters of \sigma^2(x) which is a The parameters are the parameters of \sigma^2(x) which is a
function that can be specified by the user, by default an function that can be specified by the user, by default an
multi-layer peceptron is used. multi-layer peceptron is used.
:param input_dim: the number of input dimensions :param input_dim: the number of input dimensions
:type input_dim: int :type input_dim: int
:param mapping: the mapping that gives the lengthscale across the input space (by default GPy.mappings.MLP is used with 20 hidden nodes). :param mapping: the mapping that gives the lengthscale across the input space (by default GPy.mappings.MLP is used with 20 hidden nodes).
:type mapping: GPy.core.Mapping :type mapping: GPy.core.Mapping
:rtype: Kernpart object :rtype: Kernpart object
See this paper: See this paper:
@ -36,7 +39,7 @@ class Hetero(Kernpart):
C. M. (1998) Regression with Input-dependent Noise: a Gaussian C. M. (1998) Regression with Input-dependent Noise: a Gaussian
Process Treatment In Advances in Neural Information Processing Process Treatment In Advances in Neural Information Processing
Systems, Volume 10, pp. 493-499. MIT Press Systems, Volume 10, pp. 493-499. MIT Press
for a Gaussian process treatment of this problem. for a Gaussian process treatment of this problem.
""" """
@ -47,7 +50,7 @@ class Hetero(Kernpart):
mapping = GPy.mappings.MLP(output_dim=1, hidden_dim=20, input_dim=input_dim) mapping = GPy.mappings.MLP(output_dim=1, hidden_dim=20, input_dim=input_dim)
if not transform: if not transform:
transform = GPy.core.transformations.logexp() transform = GPy.core.transformations.logexp()
self.transform = transform self.transform = transform
self.mapping = mapping self.mapping = mapping
self.name='hetero' self.name='hetero'
@ -66,7 +69,7 @@ class Hetero(Kernpart):
def K(self, X, X2, target): def K(self, X, X2, target):
"""Return covariance between X and X2.""" """Return covariance between X and X2."""
if X2==None or X2 is X: if (X2 is None) or (X2 is X):
target[np.diag_indices_from(target)] += self._Kdiag(X) target[np.diag_indices_from(target)] += self._Kdiag(X)
def Kdiag(self, X, target): def Kdiag(self, X, target):
@ -76,26 +79,26 @@ class Hetero(Kernpart):
def _Kdiag(self, X): def _Kdiag(self, X):
"""Helper function for computing the diagonal elements of the covariance.""" """Helper function for computing the diagonal elements of the covariance."""
return self.mapping.f(X).flatten()**2 return self.mapping.f(X).flatten()**2
def dK_dtheta(self, dL_dK, X, X2, target): def dK_dtheta(self, dL_dK, X, X2, target):
"""Derivative of the covariance with respect to the parameters.""" """Derivative of the covariance with respect to the parameters."""
if X2==None or X2 is X: if (X2 is None) or (X2 is X):
dL_dKdiag = dL_dK.flat[::dL_dK.shape[0]+1] dL_dKdiag = dL_dK.flat[::dL_dK.shape[0]+1]
self.dKdiag_dtheta(dL_dKdiag, X, target) self.dKdiag_dtheta(dL_dKdiag, X, target)
def dKdiag_dtheta(self, dL_dKdiag, X, target): def dKdiag_dtheta(self, dL_dKdiag, X, target):
"""Gradient of diagonal of covariance with respect to parameters.""" """Gradient of diagonal of covariance with respect to parameters."""
target += 2.*self.mapping.df_dtheta(dL_dKdiag[:, None], X)*self.mapping.f(X) target += 2.*self.mapping.df_dtheta(dL_dKdiag[:, None]*self.mapping.f(X), X)
def dK_dX(self, dL_dK, X, X2, target): def dK_dX(self, dL_dK, X, X2, target):
"""Derivative of the covariance matrix with respect to X.""" """Derivative of the covariance matrix with respect to X."""
if X2==None or X2 is X: if X2==None or X2 is X:
dL_dKdiag = dL_dK.flat[::dL_dK.shape[0]+1] dL_dKdiag = dL_dK.flat[::dL_dK.shape[0]+1]
self.dKdiag_dX(dL_dKdiag, X, target) self.dKdiag_dX(dL_dKdiag, X, target)
def dKdiag_dX(self, dL_dKdiag, X, target): def dKdiag_dX(self, dL_dKdiag, X, target):
"""Gradient of diagonal of covariance with respect to X.""" """Gradient of diagonal of covariance with respect to X."""
target += 2.*self.mapping.df_dX(dL_dKdiag[:, None], X)*self.mapping.f(X) target += 2.*self.mapping.df_dX(dL_dKdiag[:, None], X)*self.mapping.f(X)

View file

@ -58,6 +58,8 @@ class Kernpart(object):
raise NotImplementedError raise NotImplementedError
def dK_dX(self, dL_dK, X, X2, target): def dK_dX(self, dL_dK, X, X2, target):
raise NotImplementedError raise NotImplementedError
def dKdiag_dX(self, dL_dK, X, target):
raise NotImplementedError
@ -97,6 +99,9 @@ class Kernpart_stationary(Kernpart):
# wrt lengthscale is 0. # wrt lengthscale is 0.
target[0] += np.sum(dL_dKdiag) target[0] += np.sum(dL_dKdiag)
def dKdiag_dX(self, dL_dK, X, target):
pass # true for all stationary kernels
class Kernpart_inner(Kernpart): class Kernpart_inner(Kernpart):
def __init__(self,input_dim): def __init__(self,input_dim):

View file

@ -7,11 +7,13 @@ four_over_tau = 2./np.pi
class MLP(Kernpart): class MLP(Kernpart):
""" """
multi layer perceptron kernel (also known as arc sine kernel or neural network kernel)
Multi layer perceptron kernel (also known as arc sine kernel or neural network kernel)
.. math:: .. math::
k(x,y) = \sigma^2 \frac{2}{\pi} \text{asin} \left(\frac{\sigma_w^2 x^\top y+\sigma_b^2}{\sqrt{\sigma_w^2x^\top x + \sigma_b^2 + 1}\sqrt{\sigma_w^2 y^\top y \sigma_b^2 +1}} \right) k(x,y) = \\sigma^{2}\\frac{2}{\\pi } \\text{asin} \\left ( \\frac{ \\sigma_w^2 x^\\top y+\\sigma_b^2}{\\sqrt{\\sigma_w^2x^\\top x + \\sigma_b^2 + 1}\\sqrt{\\sigma_w^2 y^\\top y \\sigma_b^2 +1}} \\right )
:param input_dim: the number of input dimensions :param input_dim: the number of input dimensions
:type input_dim: int :type input_dim: int
@ -24,6 +26,7 @@ class MLP(Kernpart):
:type ARD: Boolean :type ARD: Boolean
:rtype: Kernpart object :rtype: Kernpart object
""" """
def __init__(self, input_dim, variance=1., weight_variance=None, bias_variance=100., ARD=False): def __init__(self, input_dim, variance=1., weight_variance=None, bias_variance=100., ARD=False):

38
GPy/kern/parts/odekern1.c Normal file
View file

@ -0,0 +1,38 @@
#include <math.h>
double k_uu(t1,t2,theta1,theta2,sig1,sig2)
{
double kern=0;
double dist=0;
dist = sqrt(t2*t2-t1*t1)
kern = sig1*(1+theta1*dist)*exp(-theta1*dist)
return kern;
}
double k_yy(t1, t2, theta1,theta2,sig1,sig2)
{
double kern=0;
double dist=0;
dist = sqrt(t2*t2-t1*t1)
kern = sig1*sig2 * ( exp(-theta1*dist)*(theta2-2*theta1+theta1*theta2*dist-theta1*theta1*dist) +
exp(-dist) ) / ((theta2-theta1)*(theta2-theta1))
return kern;
}

View file

@ -7,22 +7,22 @@ four_over_tau = 2./np.pi
class POLY(Kernpart): class POLY(Kernpart):
""" """
polynomial kernel parameter initialisation. Included for completeness, but generally not recommended, is the polynomial kernel,
.. math::
k(x, y) = \sigma^2*(\sigma_w^2 x'y+\sigma_b^b)^d
The kernel parameters are \sigma^2 (variance), \sigma^2_w Polynomial kernel parameter initialisation. Included for completeness, but generally not recommended, is the polynomial kernel:
(weight_variance), \sigma^2_b (bias_variance) and d
.. math::
k(x, y) = \sigma^2\*(\sigma_w^2 x'y+\sigma_b^b)^d
The kernel parameters are :math:`\sigma^2` (variance), :math:`\sigma^2_w`
(weight_variance), :math:`\sigma^2_b` (bias_variance) and d
(degree). Only gradients of the first three are provided for (degree). Only gradients of the first three are provided for
kernel optimisation, it is assumed that polynomial degree would kernel optimisation, it is assumed that polynomial degree would
be set by hand. be set by hand.
The kernel is not recommended as it is badly behaved when the The kernel is not recommended as it is badly behaved when the
\sigma^2_w*x'*y + \sigma^2_b has a magnitude greater than one. For completeness :math:`\sigma^2_w\*x'\*y + \sigma^2_b` has a magnitude greater than one. For completeness
there is an automatic relevance determination version of this there is an automatic relevance determination version of this
kernel provided. kernel provided (NOTE YET IMPLEMENTED!).
:param input_dim: the number of input dimensions :param input_dim: the number of input dimensions
:type input_dim: int :type input_dim: int
:param variance: the variance :math:`\sigma^2` :param variance: the variance :math:`\sigma^2`
@ -32,7 +32,7 @@ class POLY(Kernpart):
:param bias_variance: the variance of the prior over bias parameters :math:`\sigma^2_b` :param bias_variance: the variance of the prior over bias parameters :math:`\sigma^2_b`
:param degree: the degree of the polynomial. :param degree: the degree of the polynomial.
:type degree: int :type degree: int
:param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter \sigma^2_w), otherwise there is one weight variance parameter per dimension. :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter :math:`\sigma^2_w`), otherwise there is one weight variance parameter per dimension.
:type ARD: Boolean :type ARD: Boolean
:rtype: Kernpart object :rtype: Kernpart object

View file

@ -1,6 +1,7 @@
#include <math.h> #include <math.h>
double DiracDelta(double x){ double DiracDelta(double x){
if((x<0.000001) & (x>-0.000001))//go on, laught at my c++ skills // TODO: this doesn't seem to be a dirac delta ... should return infinity. Neil
if((x<0.000001) & (x>-0.000001))//go on, laugh at my c++ skills
return 1.0; return 1.0;
else else
return 0.0; return 0.0;
@ -8,3 +9,17 @@ double DiracDelta(double x){
double DiracDelta(double x,int foo){ double DiracDelta(double x,int foo){
return 0.0; return 0.0;
}; };
double sinc(double x){
if (x==0)
return 1.0;
else
return sin(x)/x;
}
double sinc_grad(double x){
if (x==0)
return 0.0;
else
return (x*cos(x) - sin(x))/(x*x);
}

View file

@ -1,3 +1,6 @@
#include <math.h> #include <math.h>
double DiracDelta(double x); double DiracDelta(double x);
double DiracDelta(double x, int foo); double DiracDelta(double x, int foo);
double sinc(double x);
double sinc_grad(double x);

View file

@ -26,8 +26,11 @@ class spkern(Kernpart):
- to handle multiple inputs, call them x1, z1, etc - to handle multiple inputs, call them x1, z1, etc
- to handle multpile correlated outputs, you'll need to define each covariance function and 'cross' variance function. TODO - to handle multpile correlated outputs, you'll need to define each covariance function and 'cross' variance function. TODO
""" """
def __init__(self,input_dim,k,param=None): def __init__(self,input_dim,k,name=None,param=None):
self.name='sympykern' if name is None:
self.name='sympykern'
else:
self.name = name
self._sp_k = k self._sp_k = k
sp_vars = [e for e in k.atoms() if e.is_Symbol] sp_vars = [e for e in k.atoms() if e.is_Symbol]
self._sp_x= sorted([e for e in sp_vars if e.name[0]=='x'],key=lambda x:int(x.name[1:])) self._sp_x= sorted([e for e in sp_vars if e.name[0]=='x'],key=lambda x:int(x.name[1:]))
@ -56,9 +59,9 @@ class spkern(Kernpart):
self.weave_kwargs = {\ self.weave_kwargs = {\
'support_code':self._function_code,\ 'support_code':self._function_code,\
'include_dirs':[tempfile.gettempdir(), os.path.join(current_dir,'kern/')],\ 'include_dirs':[tempfile.gettempdir(), os.path.join(current_dir,'parts/')],\
'headers':['"sympy_helpers.h"'],\ 'headers':['"sympy_helpers.h"'],\
'sources':[os.path.join(current_dir,"kern/sympy_helpers.cpp")],\ 'sources':[os.path.join(current_dir,"parts/sympy_helpers.cpp")],\
#'extra_compile_args':['-ftree-vectorize', '-mssse3', '-ftree-vectorizer-verbose=5'],\ #'extra_compile_args':['-ftree-vectorize', '-mssse3', '-ftree-vectorizer-verbose=5'],\
'extra_compile_args':[],\ 'extra_compile_args':[],\
'extra_link_args':['-lgomp'],\ 'extra_link_args':['-lgomp'],\
@ -109,14 +112,15 @@ class spkern(Kernpart):
f.write(self._function_header) f.write(self._function_header)
f.close() f.close()
#get rid of derivatives of DiracDelta # Substitute any known derivatives which sympy doesn't compute
self._function_code = re.sub('DiracDelta\(.+?,.+?\)','0.0',self._function_code) self._function_code = re.sub('DiracDelta\(.+?,.+?\)','0.0',self._function_code)
#Here's some code to do the looping for K # Here's the code to do the looping for K
arglist = ", ".join(["X[i*input_dim+%s]"%x.name[1:] for x in self._sp_x]\ arglist = ", ".join(["X[i*input_dim+%s]"%x.name[1:] for x in self._sp_x]
+ ["Z[j*input_dim+%s]"%z.name[1:] for z in self._sp_z]\ + ["Z[j*input_dim+%s]"%z.name[1:] for z in self._sp_z]
+ ["param[%i]"%i for i in range(self.num_params)]) + ["param[%i]"%i for i in range(self.num_params)])
self._K_code =\ self._K_code =\
""" """
int i; int i;
@ -133,9 +137,14 @@ class spkern(Kernpart):
%s %s
"""%(arglist,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed """%(arglist,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
# Similar code when only X is provided.
self._K_code_X = self._K_code.replace('Z[', 'X[')
# Code to compute diagonal of covariance.
diag_arglist = re.sub('Z','X',arglist) diag_arglist = re.sub('Z','X',arglist)
diag_arglist = re.sub('j','i',diag_arglist) diag_arglist = re.sub('j','i',diag_arglist)
#Here's some code to do the looping for Kdiag # Code to do the looping for Kdiag
self._Kdiag_code =\ self._Kdiag_code =\
""" """
int i; int i;
@ -148,8 +157,9 @@ class spkern(Kernpart):
%s %s
"""%(diag_arglist,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed """%(diag_arglist,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
#here's some code to compute gradients # Code to compute gradients
funclist = '\n'.join([' '*16 + 'target[%i] += partial[i*num_inducing+j]*dk_d%s(%s);'%(i,theta.name,arglist) for i,theta in enumerate(self._sp_theta)]) funclist = '\n'.join([' '*16 + 'target[%i] += partial[i*num_inducing+j]*dk_d%s(%s);'%(i,theta.name,arglist) for i,theta in enumerate(self._sp_theta)])
self._dK_dtheta_code =\ self._dK_dtheta_code =\
""" """
int i; int i;
@ -164,9 +174,12 @@ class spkern(Kernpart):
} }
} }
%s %s
"""%(funclist,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed """%(funclist,"/*"+str(self._sp_k)+"*/") # adding a string representation forces recompile when needed
#here's some code to compute gradients for Kdiag TODO: thius is yucky. # Similar code when only X is provided, change argument lists.
self._dK_dtheta_code_X = self._dK_dtheta_code.replace('Z[', 'X[')
# Code to compute gradients for Kdiag TODO: needs clean up
diag_funclist = re.sub('Z','X',funclist,count=0) diag_funclist = re.sub('Z','X',funclist,count=0)
diag_funclist = re.sub('j','i',diag_funclist) diag_funclist = re.sub('j','i',diag_funclist)
diag_funclist = re.sub('partial\[i\*num_inducing\+i\]','partial[i]',diag_funclist) diag_funclist = re.sub('partial\[i\*num_inducing\+i\]','partial[i]',diag_funclist)
@ -181,8 +194,12 @@ class spkern(Kernpart):
%s %s
"""%(diag_funclist,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed """%(diag_funclist,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
#Here's some code to do gradients wrt x # Code for gradients wrt X
gradient_funcs = "\n".join(["target[i*input_dim+%i] += partial[i*num_inducing+j]*dk_dx%i(%s);"%(q,q,arglist) for q in range(self.input_dim)]) gradient_funcs = "\n".join(["target[i*input_dim+%i] += partial[i*num_inducing+j]*dk_dx%i(%s);"%(q,q,arglist) for q in range(self.input_dim)])
if False:
gradient_funcs += """if(isnan(target[i*input_dim+2])){printf("%%f\\n",dk_dx2(X[i*input_dim+0], X[i*input_dim+1], X[i*input_dim+2], Z[j*input_dim+0], Z[j*input_dim+1], Z[j*input_dim+2], param[0], param[1], param[2], param[3], param[4], param[5]));}
if(isnan(target[i*input_dim+2])){printf("%%f,%%f,%%i,%%i\\n", X[i*input_dim+2], Z[j*input_dim+2],i,j);}"""
self._dK_dX_code = \ self._dK_dX_code = \
""" """
int i; int i;
@ -192,30 +209,34 @@ class spkern(Kernpart):
int input_dim = X_array->dimensions[1]; int input_dim = X_array->dimensions[1];
//#pragma omp parallel for private(j) //#pragma omp parallel for private(j)
for (i=0;i<N; i++){ for (i=0;i<N; i++){
for (j=0; j<num_inducing; j++){ for (j=0; j<num_inducing; j++){
%s %s
//if(isnan(target[i*input_dim+2])){printf("%%f\\n",dk_dx2(X[i*input_dim+0], X[i*input_dim+1], X[i*input_dim+2], Z[j*input_dim+0], Z[j*input_dim+1], Z[j*input_dim+2], param[0], param[1], param[2], param[3], param[4], param[5]));} }
//if(isnan(target[i*input_dim+2])){printf("%%f,%%f,%%i,%%i\\n", X[i*input_dim+2], Z[j*input_dim+2],i,j);}
}
} }
%s %s
"""%(gradient_funcs,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed """%(gradient_funcs,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed
# Create code for call when just X is passed as argument.
self._dK_dX_code_X = self._dK_dX_code.replace('Z[', 'X[').replace('+= partial[', '+= 2*partial[')
#now for gradients of Kdiag wrt X diag_gradient_funcs = re.sub('Z','X',gradient_funcs,count=0)
diag_gradient_funcs = re.sub('j','i',diag_gradient_funcs)
diag_gradient_funcs = re.sub('partial\[i\*num_inducing\+i\]','2*partial[i]',diag_gradient_funcs)
# Code for gradients of Kdiag wrt X
self._dKdiag_dX_code= \ self._dKdiag_dX_code= \
""" """
int i;
int j;
int N = partial_array->dimensions[0]; int N = partial_array->dimensions[0];
int num_inducing = 0;
int input_dim = X_array->dimensions[1]; int input_dim = X_array->dimensions[1];
for (i=0;i<N; i++){ for (int i=0;i<N; i++){
j = i;
%s %s
} }
%s %s
"""%(gradient_funcs,"/*"+str(self._sp_k)+"*/") #adding a string representation forces recompile when needed """%(diag_gradient_funcs,"/*"+str(self._sp_k)+"*/") #adding a
# string representation forces recompile when needed Get rid
# of Zs in argument for diagonal. TODO: Why wasn't
# diag_funclist called here? Need to check that.
#self._dKdiag_dX_code = self._dKdiag_dX_code.replace('Z[j', 'X[i')
#TODO: insert multiple functions here via string manipulation #TODO: insert multiple functions here via string manipulation
@ -223,7 +244,10 @@ class spkern(Kernpart):
def K(self,X,Z,target): def K(self,X,Z,target):
param = self._param param = self._param
weave.inline(self._K_code,arg_names=['target','X','Z','param'],**self.weave_kwargs) if Z is None:
weave.inline(self._K_code_X,arg_names=['target','X','param'],**self.weave_kwargs)
else:
weave.inline(self._K_code,arg_names=['target','X','Z','param'],**self.weave_kwargs)
def Kdiag(self,X,target): def Kdiag(self,X,target):
param = self._param param = self._param
@ -231,21 +255,25 @@ class spkern(Kernpart):
def dK_dtheta(self,partial,X,Z,target): def dK_dtheta(self,partial,X,Z,target):
param = self._param param = self._param
weave.inline(self._dK_dtheta_code,arg_names=['target','X','Z','param','partial'],**self.weave_kwargs) if Z is None:
weave.inline(self._dK_dtheta_code_X, arg_names=['target','X','param','partial'],**self.weave_kwargs)
else:
weave.inline(self._dK_dtheta_code, arg_names=['target','X','Z','param','partial'],**self.weave_kwargs)
def dKdiag_dtheta(self,partial,X,target): def dKdiag_dtheta(self,partial,X,target):
param = self._param param = self._param
Z = X weave.inline(self._dKdiag_dtheta_code,arg_names=['target','X','param','partial'],**self.weave_kwargs)
weave.inline(self._dKdiag_dtheta_code,arg_names=['target','X','Z','param','partial'],**self.weave_kwargs)
def dK_dX(self,partial,X,Z,target): def dK_dX(self,partial,X,Z,target):
param = self._param param = self._param
weave.inline(self._dK_dX_code,arg_names=['target','X','Z','param','partial'],**self.weave_kwargs) if Z is None:
weave.inline(self._dK_dX_code_X,arg_names=['target','X','param','partial'],**self.weave_kwargs)
else:
weave.inline(self._dK_dX_code,arg_names=['target','X','Z','param','partial'],**self.weave_kwargs)
def dKdiag_dX(self,partial,X,target): def dKdiag_dX(self,partial,X,target):
param = self._param param = self._param
Z = X weave.inline(self._dKdiag_dX_code,arg_names=['target','X','param','partial'],**self.weave_kwargs)
weave.inline(self._dKdiag_dX_code,arg_names=['target','X','Z','param','partial'],**self.weave_kwargs)
def _set_params(self,param): def _set_params(self,param):
#print param.flags['C_CONTIGUOUS'] #print param.flags['C_CONTIGUOUS']

View file

@ -4,33 +4,32 @@ from ..util.linalg import pdinv,mdot,jitchol,chol_inv,DSYR,tdot,dtrtrs
from likelihood import likelihood from likelihood import likelihood
class EP(likelihood): class EP(likelihood):
def __init__(self,data,noise_model,epsilon=1e-3,power_ep=[1.,1.]): def __init__(self,data,noise_model):
""" """
Expectation Propagation Expectation Propagation
Arguments :param data: data to model
--------- :type data: numpy array
epsilon : Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float) :param noise_model: noise distribution
noise_model : a likelihood function (see likelihood_functions.py) :type noise_model: A GPy noise model
""" """
self.noise_model = noise_model self.noise_model = noise_model
self.epsilon = epsilon
self.eta, self.delta = power_ep
self.data = data self.data = data
self.N, self.output_dim = self.data.shape self.num_data, self.output_dim = self.data.shape
self.is_heteroscedastic = True self.is_heteroscedastic = True
self.Nparams = 0 self.Nparams = 0
self._transf_data = self.noise_model._preprocess_values(data) self._transf_data = self.noise_model._preprocess_values(data)
#Initial values - Likelihood approximation parameters: #Initial values - Likelihood approximation parameters:
#p(y|f) = t(f|tau_tilde,v_tilde) #p(y|f) = t(f|tau_tilde,v_tilde)
self.tau_tilde = np.zeros(self.N) self.tau_tilde = np.zeros(self.num_data)
self.v_tilde = np.zeros(self.N) self.v_tilde = np.zeros(self.num_data)
#initial values for the GP variables #initial values for the GP variables
self.Y = np.zeros((self.N,1)) self.Y = np.zeros((self.num_data,1))
self.covariance_matrix = np.eye(self.N) self.covariance_matrix = np.eye(self.num_data)
self.precision = np.ones(self.N)[:,None] self.precision = np.ones(self.num_data)[:,None]
self.Z = 0 self.Z = 0
self.YYT = None self.YYT = None
self.V = self.precision * self.Y self.V = self.precision * self.Y
@ -40,11 +39,11 @@ class EP(likelihood):
super(EP, self).__init__() super(EP, self).__init__()
def restart(self): def restart(self):
self.tau_tilde = np.zeros(self.N) self.tau_tilde = np.zeros(self.num_data)
self.v_tilde = np.zeros(self.N) self.v_tilde = np.zeros(self.num_data)
self.Y = np.zeros((self.N,1)) self.Y = np.zeros((self.num_data,1))
self.covariance_matrix = np.eye(self.N) self.covariance_matrix = np.eye(self.num_data)
self.precision = np.ones(self.N)[:,None] self.precision = np.ones(self.num_data)[:,None]
self.Z = 0 self.Z = 0
self.YYT = None self.YYT = None
self.V = self.precision * self.Y self.V = self.precision * self.Y
@ -78,6 +77,7 @@ class EP(likelihood):
sigma_sum = 1./self.tau_ + 1./self.tau_tilde sigma_sum = 1./self.tau_ + 1./self.tau_tilde
mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2 mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2
self.Z = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant, aka Z_ep self.Z = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant, aka Z_ep
self.Z += 0.5*self.num_data*np.log(2*np.pi)
self.Y = mu_tilde[:,None] self.Y = mu_tilde[:,None]
self.YYT = np.dot(self.Y,self.Y.T) self.YYT = np.dot(self.Y,self.Y.T)
@ -87,13 +87,22 @@ class EP(likelihood):
self.VVT_factor = self.V self.VVT_factor = self.V
self.trYYT = np.trace(self.YYT) self.trYYT = np.trace(self.YYT)
def fit_full(self,K): def fit_full(self, K, epsilon=1e-3,power_ep=[1.,1.]):
""" """
The expectation-propagation algorithm. The expectation-propagation algorithm.
For nomenclature see Rasmussen & Williams 2006. For nomenclature see Rasmussen & Williams 2006.
:param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
:type epsilon: float
:param power_ep: Power EP parameters
:type power_ep: list of floats
""" """
self.epsilon = epsilon
self.eta, self.delta = power_ep
#Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma) #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
mu = np.zeros(self.N) mu = np.zeros(self.num_data)
Sigma = K.copy() Sigma = K.copy()
""" """
@ -102,15 +111,15 @@ class EP(likelihood):
sigma_ = 1./tau_ sigma_ = 1./tau_
mu_ = v_/tau_ mu_ = v_/tau_
""" """
self.tau_ = np.empty(self.N,dtype=float) self.tau_ = np.empty(self.num_data,dtype=float)
self.v_ = np.empty(self.N,dtype=float) self.v_ = np.empty(self.num_data,dtype=float)
#Initial values - Marginal moments #Initial values - Marginal moments
z = np.empty(self.N,dtype=float) z = np.empty(self.num_data,dtype=float)
self.Z_hat = np.empty(self.N,dtype=float) self.Z_hat = np.empty(self.num_data,dtype=float)
phi = np.empty(self.N,dtype=float) phi = np.empty(self.num_data,dtype=float)
mu_hat = np.empty(self.N,dtype=float) mu_hat = np.empty(self.num_data,dtype=float)
sigma2_hat = np.empty(self.N,dtype=float) sigma2_hat = np.empty(self.num_data,dtype=float)
#Approximation #Approximation
epsilon_np1 = self.epsilon + 1. epsilon_np1 = self.epsilon + 1.
@ -119,7 +128,7 @@ class EP(likelihood):
self.np1 = [self.tau_tilde.copy()] self.np1 = [self.tau_tilde.copy()]
self.np2 = [self.v_tilde.copy()] self.np2 = [self.v_tilde.copy()]
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
update_order = np.random.permutation(self.N) update_order = np.random.permutation(self.num_data)
for i in update_order: for i in update_order:
#Cavity distribution parameters #Cavity distribution parameters
self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i] self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i]
@ -137,23 +146,32 @@ class EP(likelihood):
self.iterations += 1 self.iterations += 1
#Sigma recomptutation with Cholesky decompositon #Sigma recomptutation with Cholesky decompositon
Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*K Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*K
B = np.eye(self.N) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K B = np.eye(self.num_data) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K
L = jitchol(B) L = jitchol(B)
V,info = dtrtrs(L,Sroot_tilde_K,lower=1) V,info = dtrtrs(L,Sroot_tilde_K,lower=1)
Sigma = K - np.dot(V.T,V) Sigma = K - np.dot(V.T,V)
mu = np.dot(Sigma,self.v_tilde) mu = np.dot(Sigma,self.v_tilde)
epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.num_data
epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.num_data
self.np1.append(self.tau_tilde.copy()) self.np1.append(self.tau_tilde.copy())
self.np2.append(self.v_tilde.copy()) self.np2.append(self.v_tilde.copy())
return self._compute_GP_variables() return self._compute_GP_variables()
def fit_DTC(self, Kmm, Kmn): def fit_DTC(self, Kmm, Kmn, epsilon=1e-3,power_ep=[1.,1.]):
""" """
The expectation-propagation algorithm with sparse pseudo-input. The expectation-propagation algorithm with sparse pseudo-input.
For nomenclature see ... 2013. For nomenclature see ... 2013.
:param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
:type epsilon: float
:param power_ep: Power EP parameters
:type power_ep: list of floats
""" """
self.epsilon = epsilon
self.eta, self.delta = power_ep
num_inducing = Kmm.shape[0] num_inducing = Kmm.shape[0]
#TODO: this doesn't work with uncertain inputs! #TODO: this doesn't work with uncertain inputs!
@ -182,7 +200,7 @@ class EP(likelihood):
Sigma = Diag + P*R.T*R*P.T + K Sigma = Diag + P*R.T*R*P.T + K
mu = w + P*Gamma mu = w + P*Gamma
""" """
mu = np.zeros(self.N) mu = np.zeros(self.num_data)
LLT = Kmm.copy() LLT = Kmm.copy()
Sigma_diag = Qnn_diag.copy() Sigma_diag = Qnn_diag.copy()
@ -192,15 +210,15 @@ class EP(likelihood):
sigma_ = 1./tau_ sigma_ = 1./tau_
mu_ = v_/tau_ mu_ = v_/tau_
""" """
self.tau_ = np.empty(self.N,dtype=float) self.tau_ = np.empty(self.num_data,dtype=float)
self.v_ = np.empty(self.N,dtype=float) self.v_ = np.empty(self.num_data,dtype=float)
#Initial values - Marginal moments #Initial values - Marginal moments
z = np.empty(self.N,dtype=float) z = np.empty(self.num_data,dtype=float)
self.Z_hat = np.empty(self.N,dtype=float) self.Z_hat = np.empty(self.num_data,dtype=float)
phi = np.empty(self.N,dtype=float) phi = np.empty(self.num_data,dtype=float)
mu_hat = np.empty(self.N,dtype=float) mu_hat = np.empty(self.num_data,dtype=float)
sigma2_hat = np.empty(self.N,dtype=float) sigma2_hat = np.empty(self.num_data,dtype=float)
#Approximation #Approximation
epsilon_np1 = 1 epsilon_np1 = 1
@ -209,7 +227,7 @@ class EP(likelihood):
np1 = [self.tau_tilde.copy()] np1 = [self.tau_tilde.copy()]
np2 = [self.v_tilde.copy()] np2 = [self.v_tilde.copy()]
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
update_order = np.random.permutation(self.N) update_order = np.random.permutation(self.num_data)
for i in update_order: for i in update_order:
#Cavity distribution parameters #Cavity distribution parameters
self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i] self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
@ -238,18 +256,26 @@ class EP(likelihood):
Sigma_diag = np.sum(V*V,-2) Sigma_diag = np.sum(V*V,-2)
Knmv_tilde = np.dot(Kmn,self.v_tilde) Knmv_tilde = np.dot(Kmn,self.v_tilde)
mu = np.dot(V2.T,Knmv_tilde) mu = np.dot(V2.T,Knmv_tilde)
epsilon_np1 = sum((self.tau_tilde-np1[-1])**2)/self.N epsilon_np1 = sum((self.tau_tilde-np1[-1])**2)/self.num_data
epsilon_np2 = sum((self.v_tilde-np2[-1])**2)/self.N epsilon_np2 = sum((self.v_tilde-np2[-1])**2)/self.num_data
np1.append(self.tau_tilde.copy()) np1.append(self.tau_tilde.copy())
np2.append(self.v_tilde.copy()) np2.append(self.v_tilde.copy())
self._compute_GP_variables() self._compute_GP_variables()
def fit_FITC(self, Kmm, Kmn, Knn_diag): def fit_FITC(self, Kmm, Kmn, Knn_diag, epsilon=1e-3,power_ep=[1.,1.]):
""" """
The expectation-propagation algorithm with sparse pseudo-input. The expectation-propagation algorithm with sparse pseudo-input.
For nomenclature see Naish-Guzman and Holden, 2008. For nomenclature see Naish-Guzman and Holden, 2008.
:param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
:type epsilon: float
:param power_ep: Power EP parameters
:type power_ep: list of floats
""" """
self.epsilon = epsilon
self.eta, self.delta = power_ep
num_inducing = Kmm.shape[0] num_inducing = Kmm.shape[0]
""" """
@ -272,9 +298,9 @@ class EP(likelihood):
Sigma = Diag + P*R.T*R*P.T + K Sigma = Diag + P*R.T*R*P.T + K
mu = w + P*Gamma mu = w + P*Gamma
""" """
self.w = np.zeros(self.N) self.w = np.zeros(self.num_data)
self.Gamma = np.zeros(num_inducing) self.Gamma = np.zeros(num_inducing)
mu = np.zeros(self.N) mu = np.zeros(self.num_data)
P = P0.copy() P = P0.copy()
R = R0.copy() R = R0.copy()
Diag = Diag0.copy() Diag = Diag0.copy()
@ -287,15 +313,15 @@ class EP(likelihood):
sigma_ = 1./tau_ sigma_ = 1./tau_
mu_ = v_/tau_ mu_ = v_/tau_
""" """
self.tau_ = np.empty(self.N,dtype=float) self.tau_ = np.empty(self.num_data,dtype=float)
self.v_ = np.empty(self.N,dtype=float) self.v_ = np.empty(self.num_data,dtype=float)
#Initial values - Marginal moments #Initial values - Marginal moments
z = np.empty(self.N,dtype=float) z = np.empty(self.num_data,dtype=float)
self.Z_hat = np.empty(self.N,dtype=float) self.Z_hat = np.empty(self.num_data,dtype=float)
phi = np.empty(self.N,dtype=float) phi = np.empty(self.num_data,dtype=float)
mu_hat = np.empty(self.N,dtype=float) mu_hat = np.empty(self.num_data,dtype=float)
sigma2_hat = np.empty(self.N,dtype=float) sigma2_hat = np.empty(self.num_data,dtype=float)
#Approximation #Approximation
epsilon_np1 = 1 epsilon_np1 = 1
@ -304,7 +330,7 @@ class EP(likelihood):
self.np1 = [self.tau_tilde.copy()] self.np1 = [self.tau_tilde.copy()]
self.np2 = [self.v_tilde.copy()] self.np2 = [self.v_tilde.copy()]
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
update_order = np.random.permutation(self.N) update_order = np.random.permutation(self.num_data)
for i in update_order: for i in update_order:
#Cavity distribution parameters #Cavity distribution parameters
self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i] self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i]
@ -343,8 +369,8 @@ class EP(likelihood):
self.w = Diag * self.v_tilde self.w = Diag * self.v_tilde
self.Gamma = np.dot(R.T, np.dot(RPT,self.v_tilde)) self.Gamma = np.dot(R.T, np.dot(RPT,self.v_tilde))
mu = self.w + np.dot(P,self.Gamma) mu = self.w + np.dot(P,self.Gamma)
epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.num_data
epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.num_data
self.np1.append(self.tau_tilde.copy()) self.np1.append(self.tau_tilde.copy())
self.np2.append(self.v_tilde.copy()) self.np2.append(self.v_tilde.copy())

View file

@ -10,14 +10,16 @@ class likelihood(Parameterized):
(Gaussian) inherits directly from this, as does the EP algorithm (Gaussian) inherits directly from this, as does the EP algorithm
Some things must be defined for this to work properly: Some things must be defined for this to work properly:
self.Y : the effective Gaussian target of the GP
self.N, self.D : Y.shape - self.Y : the effective Gaussian target of the GP
self.covariance_matrix : the effective (noise) covariance of the GP targets - self.N, self.D : Y.shape
self.Z : a factor which gets added to the likelihood (0 for a Gaussian, Z_EP for EP) - self.covariance_matrix : the effective (noise) covariance of the GP targets
self.is_heteroscedastic : enables significant computational savings in GP - self.Z : a factor which gets added to the likelihood (0 for a Gaussian, Z_EP for EP)
self.precision : a scalar or vector representation of the effective target precision - self.is_heteroscedastic : enables significant computational savings in GP
self.YYT : (optional) = np.dot(self.Y, self.Y.T) enables computational savings for D>N - self.precision : a scalar or vector representation of the effective target precision
self.V : self.precision * self.Y - self.YYT : (optional) = np.dot(self.Y, self.Y.T) enables computational savings for D>N
- self.V : self.precision * self.Y
""" """
def __init__(self): def __init__(self):
Parameterized.__init__(self) Parameterized.__init__(self)

View file

@ -1,6 +1,7 @@
# Copyright (c) 2012, 2013 Ricardo Andrade # Copyright (c) 2012, 2013 Ricardo Andrade
# Licensed under the BSD 3-clause license (see LICENSE.txt) # Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np import numpy as np
from scipy import stats,special from scipy import stats,special
import scipy as sp import scipy as sp
@ -116,18 +117,3 @@ class Binomial(NoiseDistribution):
def _d2variance_dgp2(self,gp): def _d2variance_dgp2(self,gp):
return self.gp_link.d2transf_df2(gp)*(1. - 2.*self.gp_link.transf(gp)) - 2*self.gp_link.dtransf_df(gp)**2 return self.gp_link.d2transf_df2(gp)*(1. - 2.*self.gp_link.transf(gp)) - 2*self.gp_link.dtransf_df(gp)**2
"""
def predictive_values(self,mu,var): #TODO remove
mu = mu.flatten()
var = var.flatten()
#mean = stats.norm.cdf(mu/np.sqrt(1+var))
mean = self._predictive_mean_analytical(mu,np.sqrt(var))
norm_025 = [stats.norm.ppf(.025,m,v) for m,v in zip(mu,var)]
norm_975 = [stats.norm.ppf(.975,m,v) for m,v in zip(mu,var)]
#p_025 = stats.norm.cdf(norm_025/np.sqrt(1+var))
#p_975 = stats.norm.cdf(norm_975/np.sqrt(1+var))
p_025 = self._predictive_mean_analytical(norm_025,np.sqrt(var))
p_975 = self._predictive_mean_analytical(norm_975,np.sqrt(var))
return mean[:,None], np.nan*var, p_025[:,None], p_975[:,None] # TODO: var
"""

View file

@ -11,7 +11,7 @@ from noise_distributions import NoiseDistribution
class Exponential(NoiseDistribution): class Exponential(NoiseDistribution):
""" """
Gamma likelihood Expoential likelihood
Y is expected to take values in {0,1,2,...} Y is expected to take values in {0,1,2,...}
----- -----
$$ $$

View file

@ -57,12 +57,12 @@ class Gaussian(NoiseDistribution):
new_sigma2 = self.predictive_variance(mu,sigma) new_sigma2 = self.predictive_variance(mu,sigma)
return new_sigma2*(mu/sigma**2 + self.gp_link.transf(mu)/self.variance) return new_sigma2*(mu/sigma**2 + self.gp_link.transf(mu)/self.variance)
def _predictive_variance_analytical(self,mu,sigma,*args): #TODO *args? def _predictive_variance_analytical(self,mu,sigma):
return 1./(1./self.variance + 1./sigma**2) return 1./(1./self.variance + 1./sigma**2)
def _mass(self,gp,obs): def _mass(self,gp,obs):
#return std_norm_pdf( (self.gp_link.transf(gp)-obs)/np.sqrt(self.variance) ) #return std_norm_pdf( (self.gp_link.transf(gp)-obs)/np.sqrt(self.variance) )
return stats.norm.pdf(obs,self.gp_link.transf(gp),np.sqrt(self.variance)) #FIXME return stats.norm.pdf(obs,self.gp_link.transf(gp),np.sqrt(self.variance))
def _nlog_mass(self,gp,obs): def _nlog_mass(self,gp,obs):
return .5*((self.gp_link.transf(gp)-obs)**2/self.variance + np.log(2.*np.pi*self.variance)) return .5*((self.gp_link.transf(gp)-obs)**2/self.variance + np.log(2.*np.pi*self.variance))

View file

@ -13,20 +13,38 @@ class GPTransformation(object):
Link function class for doing non-Gaussian likelihoods approximation Link function class for doing non-Gaussian likelihoods approximation
:param Y: observed output (Nx1 numpy.darray) :param Y: observed output (Nx1 numpy.darray)
..Note:: Y values allowed depend on the likelihood_function used
.. note:: Y values allowed depend on the likelihood_function used
""" """
def __init__(self): def __init__(self):
pass pass
def transf(self,f):
"""
Gaussian process tranformation function, latent space -> output space
"""
pass
def dtransf_df(self,f):
"""
derivative of transf(f) w.r.t. f
"""
pass
def d2transf_df2(self,f):
"""
second derivative of transf(f) w.r.t. f
"""
pass
class Identity(GPTransformation): class Identity(GPTransformation):
""" """
$$ .. math::
g(f) = f
$$
"""
#def transf(self,mu):
# return mu
g(f) = f
"""
def transf(self,f): def transf(self,f):
return f return f
@ -39,13 +57,11 @@ class Identity(GPTransformation):
class Probit(GPTransformation): class Probit(GPTransformation):
""" """
$$ .. math::
g(f) = \\Phi^{-1} (mu)
$$
"""
#def transf(self,mu):
# return inv_std_norm_cdf(mu)
g(f) = \\Phi^{-1} (mu)
"""
def transf(self,f): def transf(self,f):
return std_norm_cdf(f) return std_norm_cdf(f)
@ -57,13 +73,11 @@ class Probit(GPTransformation):
class Log(GPTransformation): class Log(GPTransformation):
""" """
$$ .. math::
g(f) = \log(\mu)
$$
"""
#def transf(self,mu):
# return np.log(mu)
g(f) = \\log(\\mu)
"""
def transf(self,f): def transf(self,f):
return np.exp(f) return np.exp(f)
@ -75,20 +89,12 @@ class Log(GPTransformation):
class Log_ex_1(GPTransformation): class Log_ex_1(GPTransformation):
""" """
$$ .. math::
g(f) = \log(\exp(\mu) - 1)
$$
"""
#def transf(self,mu):
# """
# function: output space -> latent space
# """
# return np.log(np.exp(mu) - 1)
g(f) = \\log(\\exp(\\mu) - 1)
"""
def transf(self,f): def transf(self,f):
"""
function: latent space -> output space
"""
return np.log(1.+np.exp(f)) return np.log(1.+np.exp(f))
def dtransf_df(self,f): def dtransf_df(self,f):
@ -110,9 +116,11 @@ class Reciprocal(GPTransformation):
class Heaviside(GPTransformation): class Heaviside(GPTransformation):
""" """
$$
g(f) = I_{x \in A} .. math::
$$
g(f) = I_{x \\in A}
""" """
def transf(self,f): def transf(self,f):
#transformation goes here #transformation goes here

View file

@ -16,10 +16,11 @@ class NoiseDistribution(object):
Likelihood class for doing Expectation propagation Likelihood class for doing Expectation propagation
:param Y: observed output (Nx1 numpy.darray) :param Y: observed output (Nx1 numpy.darray)
..Note:: Y values allowed depend on the LikelihoodFunction used
.. note:: Y values allowed depend on the LikelihoodFunction used
""" """
def __init__(self,gp_link,analytical_mean=False,analytical_variance=False): def __init__(self,gp_link,analytical_mean=False,analytical_variance=False):
#assert isinstance(gp_link,gp_transformations.GPTransformation), "gp_link is not a valid GPTransformation."#FIXME assert isinstance(gp_link,gp_transformations.GPTransformation), "gp_link is not a valid GPTransformation."
self.gp_link = gp_link self.gp_link = gp_link
self.analytical_mean = analytical_mean self.analytical_mean = analytical_mean
self.analytical_variance = analytical_variance self.analytical_variance = analytical_variance
@ -50,7 +51,9 @@ class NoiseDistribution(object):
""" """
In case it is needed, this function assess the output values or makes any pertinent transformation on them. In case it is needed, this function assess the output values or makes any pertinent transformation on them.
:param Y: observed output (Nx1 numpy.darray) :param Y: observed output
:type Y: Nx1 numpy.darray
""" """
return Y return Y
@ -62,18 +65,21 @@ class NoiseDistribution(object):
:param obs: observed output :param obs: observed output
:param mu: cavity distribution mean :param mu: cavity distribution mean
:param sigma: cavity distribution standard deviation :param sigma: cavity distribution standard deviation
""" """
return stats.norm.pdf(gp,loc=mu,scale=sigma) * self._mass(gp,obs) return stats.norm.pdf(gp,loc=mu,scale=sigma) * self._mass(gp,obs)
def _nlog_product_scaled(self,gp,obs,mu,sigma): def _nlog_product_scaled(self,gp,obs,mu,sigma):
""" """
Negative log-product between the cavity distribution and a likelihood factor. Negative log-product between the cavity distribution and a likelihood factor.
..Note:: The constant term in the Gaussian distribution is ignored.
.. note:: The constant term in the Gaussian distribution is ignored.
:param gp: latent variable :param gp: latent variable
:param obs: observed output :param obs: observed output
:param mu: cavity distribution mean :param mu: cavity distribution mean
:param sigma: cavity distribution standard deviation :param sigma: cavity distribution standard deviation
""" """
return .5*((gp-mu)/sigma)**2 + self._nlog_mass(gp,obs) return .5*((gp-mu)/sigma)**2 + self._nlog_mass(gp,obs)
@ -85,6 +91,7 @@ class NoiseDistribution(object):
:param obs: observed output :param obs: observed output
:param mu: cavity distribution mean :param mu: cavity distribution mean
:param sigma: cavity distribution standard deviation :param sigma: cavity distribution standard deviation
""" """
return (gp - mu)/sigma**2 + self._dnlog_mass_dgp(gp,obs) return (gp - mu)/sigma**2 + self._dnlog_mass_dgp(gp,obs)
@ -96,6 +103,7 @@ class NoiseDistribution(object):
:param obs: observed output :param obs: observed output
:param mu: cavity distribution mean :param mu: cavity distribution mean
:param sigma: cavity distribution standard deviation :param sigma: cavity distribution standard deviation
""" """
return 1./sigma**2 + self._d2nlog_mass_dgp2(gp,obs) return 1./sigma**2 + self._d2nlog_mass_dgp2(gp,obs)
@ -106,6 +114,7 @@ class NoiseDistribution(object):
:param obs: observed output :param obs: observed output
:param mu: cavity distribution mean :param mu: cavity distribution mean
:param sigma: cavity distribution standard deviation :param sigma: cavity distribution standard deviation
""" """
return sp.optimize.fmin_ncg(self._nlog_product_scaled,x0=mu,fprime=self._dnlog_product_dgp,fhess=self._d2nlog_product_dgp2,args=(obs,mu,sigma),disp=False) return sp.optimize.fmin_ncg(self._nlog_product_scaled,x0=mu,fprime=self._dnlog_product_dgp,fhess=self._d2nlog_product_dgp2,args=(obs,mu,sigma),disp=False)
@ -122,6 +131,7 @@ class NoiseDistribution(object):
:param obs: observed output :param obs: observed output
:param tau: cavity distribution 1st natural parameter (precision) :param tau: cavity distribution 1st natural parameter (precision)
:param v: cavity distribution 2nd natural paramenter (mu*precision) :param v: cavity distribution 2nd natural paramenter (mu*precision)
""" """
mu = v/tau mu = v/tau
mu_hat = self._product_mode(obs,mu,np.sqrt(1./tau)) mu_hat = self._product_mode(obs,mu,np.sqrt(1./tau))
@ -137,7 +147,8 @@ class NoiseDistribution(object):
:param mu: cavity distribution mean :param mu: cavity distribution mean
:param sigma: cavity distribution standard deviation :param sigma: cavity distribution standard deviation
..Note:: This function helps computing E(Y_star) = E(E(Y_star|f_star)) .. note:: This function helps computing E(Y_star) = E(E(Y_star|f_star))
""" """
return .5*((gp - mu)/sigma)**2 - np.log(self._mean(gp)) return .5*((gp - mu)/sigma)**2 - np.log(self._mean(gp))
@ -148,6 +159,7 @@ class NoiseDistribution(object):
:param gp: latent variable :param gp: latent variable
:param mu: cavity distribution mean :param mu: cavity distribution mean
:param sigma: cavity distribution standard deviation :param sigma: cavity distribution standard deviation
""" """
return (gp - mu)/sigma**2 - self._dmean_dgp(gp)/self._mean(gp) return (gp - mu)/sigma**2 - self._dmean_dgp(gp)/self._mean(gp)
@ -158,6 +170,7 @@ class NoiseDistribution(object):
:param gp: latent variable :param gp: latent variable
:param mu: cavity distribution mean :param mu: cavity distribution mean
:param sigma: cavity distribution standard deviation :param sigma: cavity distribution standard deviation
""" """
return 1./sigma**2 - self._d2mean_dgp2(gp)/self._mean(gp) + (self._dmean_dgp(gp)/self._mean(gp))**2 return 1./sigma**2 - self._d2mean_dgp2(gp)/self._mean(gp) + (self._dmean_dgp(gp)/self._mean(gp))**2
@ -169,7 +182,8 @@ class NoiseDistribution(object):
:param mu: cavity distribution mean :param mu: cavity distribution mean
:param sigma: cavity distribution standard deviation :param sigma: cavity distribution standard deviation
..Note:: This function helps computing E(V(Y_star|f_star)) .. note:: This function helps computing E(V(Y_star|f_star))
""" """
return .5*((gp - mu)/sigma)**2 - np.log(self._variance(gp)) return .5*((gp - mu)/sigma)**2 - np.log(self._variance(gp))
@ -180,6 +194,7 @@ class NoiseDistribution(object):
:param gp: latent variable :param gp: latent variable
:param mu: cavity distribution mean :param mu: cavity distribution mean
:param sigma: cavity distribution standard deviation :param sigma: cavity distribution standard deviation
""" """
return (gp - mu)/sigma**2 - self._dvariance_dgp(gp)/self._variance(gp) return (gp - mu)/sigma**2 - self._dvariance_dgp(gp)/self._variance(gp)
@ -190,6 +205,7 @@ class NoiseDistribution(object):
:param gp: latent variable :param gp: latent variable
:param mu: cavity distribution mean :param mu: cavity distribution mean
:param sigma: cavity distribution standard deviation :param sigma: cavity distribution standard deviation
""" """
return 1./sigma**2 - self._d2variance_dgp2(gp)/self._variance(gp) + (self._dvariance_dgp(gp)/self._variance(gp))**2 return 1./sigma**2 - self._d2variance_dgp2(gp)/self._variance(gp) + (self._dvariance_dgp(gp)/self._variance(gp))**2
@ -201,7 +217,8 @@ class NoiseDistribution(object):
:param mu: cavity distribution mean :param mu: cavity distribution mean
:param sigma: cavity distribution standard deviation :param sigma: cavity distribution standard deviation
..Note:: This function helps computing E( E(Y_star|f_star)**2 ) .. note:: This function helps computing E( E(Y_star|f_star)**2 )
""" """
return .5*((gp - mu)/sigma)**2 - 2*np.log(self._mean(gp)) return .5*((gp - mu)/sigma)**2 - 2*np.log(self._mean(gp))
@ -212,6 +229,7 @@ class NoiseDistribution(object):
:param gp: latent variable :param gp: latent variable
:param mu: cavity distribution mean :param mu: cavity distribution mean
:param sigma: cavity distribution standard deviation :param sigma: cavity distribution standard deviation
""" """
return (gp - mu)/sigma**2 - 2*self._dmean_dgp(gp)/self._mean(gp) return (gp - mu)/sigma**2 - 2*self._dmean_dgp(gp)/self._mean(gp)
@ -222,6 +240,7 @@ class NoiseDistribution(object):
:param gp: latent variable :param gp: latent variable
:param mu: cavity distribution mean :param mu: cavity distribution mean
:param sigma: cavity distribution standard deviation :param sigma: cavity distribution standard deviation
""" """
return 1./sigma**2 - 2*( self._d2mean_dgp2(gp)/self._mean(gp) - (self._dmean_dgp(gp)/self._mean(gp))**2 ) return 1./sigma**2 - 2*( self._d2mean_dgp2(gp)/self._mean(gp) - (self._dmean_dgp(gp)/self._mean(gp))**2 )
@ -243,6 +262,7 @@ class NoiseDistribution(object):
:param mu: cavity distribution mean :param mu: cavity distribution mean
:param sigma: cavity distribution standard deviation :param sigma: cavity distribution standard deviation
""" """
maximum = sp.optimize.fmin_ncg(self._nlog_conditional_mean_scaled,x0=self._mean(mu),fprime=self._dnlog_conditional_mean_dgp,fhess=self._d2nlog_conditional_mean_dgp2,args=(mu,sigma),disp=False) maximum = sp.optimize.fmin_ncg(self._nlog_conditional_mean_scaled,x0=self._mean(mu),fprime=self._dnlog_conditional_mean_dgp,fhess=self._d2nlog_conditional_mean_dgp2,args=(mu,sigma),disp=False)
mean = np.exp(-self._nlog_conditional_mean_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_conditional_mean_dgp2(maximum,mu,sigma))*sigma) mean = np.exp(-self._nlog_conditional_mean_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_conditional_mean_dgp2(maximum,mu,sigma))*sigma)
@ -266,6 +286,7 @@ class NoiseDistribution(object):
:param mu: cavity distribution mean :param mu: cavity distribution mean
:param sigma: cavity distribution standard deviation :param sigma: cavity distribution standard deviation
""" """
maximum = sp.optimize.fmin_ncg(self._nlog_exp_conditional_mean_sq_scaled,x0=self._mean(mu),fprime=self._dnlog_exp_conditional_mean_sq_dgp,fhess=self._d2nlog_exp_conditional_mean_sq_dgp2,args=(mu,sigma),disp=False) maximum = sp.optimize.fmin_ncg(self._nlog_exp_conditional_mean_sq_scaled,x0=self._mean(mu),fprime=self._dnlog_exp_conditional_mean_sq_dgp,fhess=self._d2nlog_exp_conditional_mean_sq_dgp2,args=(mu,sigma),disp=False)
mean_squared = np.exp(-self._nlog_exp_conditional_mean_sq_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_exp_conditional_mean_sq_dgp2(maximum,mu,sigma))*sigma) mean_squared = np.exp(-self._nlog_exp_conditional_mean_sq_scaled(maximum,mu,sigma))/(np.sqrt(self._d2nlog_exp_conditional_mean_sq_dgp2(maximum,mu,sigma))*sigma)
@ -278,6 +299,7 @@ class NoiseDistribution(object):
:param mu: cavity distribution mean :param mu: cavity distribution mean
:param sigma: cavity distribution standard deviation :param sigma: cavity distribution standard deviation
:predictive_mean: output's predictive mean, if None _predictive_mean function will be called. :predictive_mean: output's predictive mean, if None _predictive_mean function will be called.
""" """
# E( V(Y_star|f_star) ) # E( V(Y_star|f_star) )
maximum = sp.optimize.fmin_ncg(self._nlog_exp_conditional_variance_scaled,x0=self._variance(mu),fprime=self._dnlog_exp_conditional_variance_dgp,fhess=self._d2nlog_exp_conditional_variance_dgp2,args=(mu,sigma),disp=False) maximum = sp.optimize.fmin_ncg(self._nlog_exp_conditional_variance_scaled,x0=self._variance(mu),fprime=self._dnlog_exp_conditional_variance_dgp,fhess=self._d2nlog_exp_conditional_variance_dgp2,args=(mu,sigma),disp=False)
@ -310,6 +332,7 @@ class NoiseDistribution(object):
:param mu: cavity distribution mean :param mu: cavity distribution mean
:param sigma: cavity distribution standard deviation :param sigma: cavity distribution standard deviation
:predictive_mean: output's predictive mean, if None _predictive_mean function will be called. :predictive_mean: output's predictive mean, if None _predictive_mean function will be called.
""" """
qf = stats.norm.ppf(p,mu,sigma) qf = stats.norm.ppf(p,mu,sigma)
return self.gp_link.transf(qf) return self.gp_link.transf(qf)
@ -321,6 +344,7 @@ class NoiseDistribution(object):
:param x: tuple (latent variable,output) :param x: tuple (latent variable,output)
:param mu: latent variable's predictive mean :param mu: latent variable's predictive mean
:param sigma: latent variable's predictive standard deviation :param sigma: latent variable's predictive standard deviation
""" """
return self._nlog_product_scaled(x[0],x[1],mu,sigma) return self._nlog_product_scaled(x[0],x[1],mu,sigma)
@ -331,7 +355,9 @@ class NoiseDistribution(object):
:param x: tuple (latent variable,output) :param x: tuple (latent variable,output)
:param mu: latent variable's predictive mean :param mu: latent variable's predictive mean
:param sigma: latent variable's predictive standard deviation :param sigma: latent variable's predictive standard deviation
..Note: Only avilable when the output is continuous
.. note: Only available when the output is continuous
""" """
assert not self.discrete, "Gradient not available for discrete outputs." assert not self.discrete, "Gradient not available for discrete outputs."
return np.array((self._dnlog_product_dgp(gp=x[0],obs=x[1],mu=mu,sigma=sigma),self._dnlog_mass_dobs(obs=x[1],gp=x[0]))) return np.array((self._dnlog_product_dgp(gp=x[0],obs=x[1],mu=mu,sigma=sigma),self._dnlog_mass_dobs(obs=x[1],gp=x[0])))
@ -343,7 +369,9 @@ class NoiseDistribution(object):
:param x: tuple (latent variable,output) :param x: tuple (latent variable,output)
:param mu: latent variable's predictive mean :param mu: latent variable's predictive mean
:param sigma: latent variable's predictive standard deviation :param sigma: latent variable's predictive standard deviation
..Note: Only avilable when the output is continuous
.. note: Only available when the output is continuous
""" """
assert not self.discrete, "Hessian not available for discrete outputs." assert not self.discrete, "Hessian not available for discrete outputs."
cross_derivative = self._d2nlog_mass_dcross(gp=x[0],obs=x[1]) cross_derivative = self._d2nlog_mass_dcross(gp=x[0],obs=x[1])
@ -356,14 +384,17 @@ class NoiseDistribution(object):
:param x: tuple (latent variable,output) :param x: tuple (latent variable,output)
:param mu: latent variable's predictive mean :param mu: latent variable's predictive mean
:param sigma: latent variable's predictive standard deviation :param sigma: latent variable's predictive standard deviation
""" """
return sp.optimize.fmin_ncg(self._nlog_joint_predictive_scaled,x0=(mu,self.gp_link.transf(mu)),fprime=self._gradient_nlog_joint_predictive,fhess=self._hessian_nlog_joint_predictive,args=(mu,sigma),disp=False) return sp.optimize.fmin_ncg(self._nlog_joint_predictive_scaled,x0=(mu,self.gp_link.transf(mu)),fprime=self._gradient_nlog_joint_predictive,fhess=self._hessian_nlog_joint_predictive,args=(mu,sigma),disp=False)
def predictive_values(self,mu,var): def predictive_values(self,mu,var):
""" """
Compute mean, variance and conficence interval (percentiles 5 and 95) of the prediction Compute mean, variance and conficence interval (percentiles 5 and 95) of the prediction.
:param mu: mean of the latent variable :param mu: mean of the latent variable
:param var: variance of the latent variable :param var: variance of the latent variable
""" """
if isinstance(mu,float) or isinstance(mu,int): if isinstance(mu,float) or isinstance(mu,int):
mu = [mu] mu = [mu]

View file

@ -12,29 +12,22 @@ from noise_distributions import NoiseDistribution
class Poisson(NoiseDistribution): class Poisson(NoiseDistribution):
""" """
Poisson likelihood Poisson likelihood
Y is expected to take values in {0,1,2,...}
----- .. math::
$$ L(x) = \\exp(\\lambda) * \\frac{\\lambda^Y_i}{Y_i!}
L(x) = \exp(\lambda) * \lambda**Y_i / Y_i!
$$ ..Note: Y is expected to take values in {0,1,2,...}
""" """
def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False): def __init__(self,gp_link=None,analytical_mean=False,analytical_variance=False):
#self.discrete = True
#self.support_limits = (0,np.inf)
#self.analytical_mean = False
super(Poisson, self).__init__(gp_link,analytical_mean,analytical_variance) super(Poisson, self).__init__(gp_link,analytical_mean,analytical_variance)
def _preprocess_values(self,Y): #TODO def _preprocess_values(self,Y): #TODO
#self.scale = .5*Y.max() return Y
#self.shift = Y.mean()
return Y #(Y - self.shift)/self.scale
def _mass(self,gp,obs): def _mass(self,gp,obs):
""" """
Mass (or density) function Mass (or density) function
""" """
#obs = obs*self.scale + self.shift
return stats.poisson.pmf(obs,self.gp_link.transf(gp)) return stats.poisson.pmf(obs,self.gp_link.transf(gp))
def _nlog_mass(self,gp,obs): def _nlog_mass(self,gp,obs):
@ -51,15 +44,6 @@ class Poisson(NoiseDistribution):
transf = self.gp_link.transf(gp) transf = self.gp_link.transf(gp)
return obs * ((self.gp_link.dtransf_df(gp)/transf)**2 - d2_df/transf) + d2_df return obs * ((self.gp_link.dtransf_df(gp)/transf)**2 - d2_df/transf) + d2_df
def _dnlog_mass_dobs(self,obs,gp): #TODO not needed
return special.psi(obs+1) - np.log(self.gp_link.transf(gp))
def _d2nlog_mass_dobs2(self,obs,gp=None): #TODO not needed
return special.polygamma(1,obs)
def _d2nlog_mass_dcross(self,obs,gp): #TODO not needed
return -self.gp_link.dtransf_df(gp)/self.gp_link.transf(gp)
def _mean(self,gp): def _mean(self,gp):
""" """
Mass (or density) function Mass (or density) function

View file

@ -10,11 +10,13 @@ class MLP(Mapping):
.. math:: .. math::
f(\mathbf{x}*) = \mathbf{W}^0\boldsymbol{\phi}(\mathbf{W}^1\mathbf{x}+\mathb{b}^1)^* + \mathbf{b}^0 f(\\mathbf{x}*) = \\mathbf{W}^0\\boldsymbol{\\phi}(\\mathbf{W}^1\\mathbf{x}+\\mathbf{b}^1)^* + \\mathbf{b}^0
where where
..math::
\phi(\cdot) = \text{tanh}(\cdot) .. math::
\\phi(\\cdot) = \\text{tanh}(\\cdot)
:param X: input observations :param X: input observations
:type X: ndarray :type X: ndarray
@ -22,6 +24,7 @@ class MLP(Mapping):
:type output_dim: int :type output_dim: int
:param hidden_dim: dimension of hidden layer. If it is an int, there is one hidden layer of the given dimension. If it is a list of ints there are as manny hidden layers as the length of the list, each with the given number of hidden nodes in it. :param hidden_dim: dimension of hidden layer. If it is an int, there is one hidden layer of the given dimension. If it is a list of ints there are as manny hidden layers as the length of the list, each with the given number of hidden nodes in it.
:type hidden_dim: int or list of ints. :type hidden_dim: int or list of ints.
""" """
def __init__(self, input_dim=1, output_dim=1, hidden_dim=3): def __init__(self, input_dim=1, output_dim=1, hidden_dim=3):

View file

@ -8,7 +8,7 @@ from .. import kern
import itertools import itertools
from matplotlib.colors import colorConverter from matplotlib.colors import colorConverter
from GPy.inference.optimization import SCG from GPy.inference.optimization import SCG
from GPy.util import plot_latent from GPy.util import plot_latent, linalg
from GPy.models.gplvm import GPLVM from GPy.models.gplvm import GPLVM
from GPy.util.plot_latent import most_significant_input_dimensions from GPy.util.plot_latent import most_significant_input_dimensions
from matplotlib import pyplot from matplotlib import pyplot
@ -140,12 +140,20 @@ class BayesianGPLVM(SparseGP, GPLVM):
dpsi0 = -0.5 * self.input_dim * self.likelihood.precision dpsi0 = -0.5 * self.input_dim * self.likelihood.precision
dpsi2 = self.dL_dpsi2[0][None, :, :] # TODO: this may change if we ignore het. likelihoods dpsi2 = self.dL_dpsi2[0][None, :, :] # TODO: this may change if we ignore het. likelihoods
V = self.likelihood.precision * Y V = self.likelihood.precision * Y
#compute CPsi1V
if self.Cpsi1V is None:
psi1V = np.dot(self.psi1.T, self.likelihood.V)
tmp, _ = linalg.dtrtrs(self._Lm, np.asfortranarray(psi1V), lower=1, trans=0)
tmp, _ = linalg.dpotrs(self.LB, tmp, lower=1)
self.Cpsi1V, _ = linalg.dtrtrs(self._Lm, tmp, lower=1, trans=1)
dpsi1 = np.dot(self.Cpsi1V, V.T) dpsi1 = np.dot(self.Cpsi1V, V.T)
start = np.zeros(self.input_dim * 2) start = np.zeros(self.input_dim * 2)
for n, dpsi1_n in enumerate(dpsi1.T[:, :, None]): for n, dpsi1_n in enumerate(dpsi1.T[:, :, None]):
args = (self.kern, self.Z, dpsi0, dpsi1_n, dpsi2) args = (self.kern, self.Z, dpsi0, dpsi1_n.T, dpsi2)
xopt, fopt, neval, status = SCG(f=latent_cost, gradf=latent_grad, x=start, optargs=args, display=False) xopt, fopt, neval, status = SCG(f=latent_cost, gradf=latent_grad, x=start, optargs=args, display=False)
mu, log_S = xopt.reshape(2, 1, -1) mu, log_S = xopt.reshape(2, 1, -1)
@ -237,12 +245,13 @@ class BayesianGPLVM(SparseGP, GPLVM):
""" """
Plot latent space X in 1D: Plot latent space X in 1D:
-if fig is given, create input_dim subplots in fig and plot in these - if fig is given, create input_dim subplots in fig and plot in these
-if ax is given plot input_dim 1D latent space plots of X into each `axis` - if ax is given plot input_dim 1D latent space plots of X into each `axis`
-if neither fig nor ax is given create a figure with fignum and plot in there - if neither fig nor ax is given create a figure with fignum and plot in there
colors: colors:
colors of different latent space dimensions input_dim colors of different latent space dimensions input_dim
""" """
import pylab import pylab
if ax is None: if ax is None:

View file

@ -25,14 +25,14 @@ class GPMultioutputRegression(GP):
:type normalize_X: False|True :type normalize_X: False|True
:param normalize_Y: whether to normalize the input data before computing (predictions will be in original scales) :param normalize_Y: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_Y: False|True :type normalize_Y: False|True
:param W_columns: number tuples of the corregionalization parameters 'coregion_W' (see coregionalize kernel documentation) :param rank: number tuples of the corregionalization parameters 'coregion_W' (see coregionalize kernel documentation)
:type W_columns: integer :type rank: integer
""" """
def __init__(self,X_list,Y_list,kernel_list=None,noise_variance_list=None,normalize_X=False,normalize_Y=False,W_columns=1): def __init__(self,X_list,Y_list,kernel_list=None,noise_variance_list=None,normalize_X=False,normalize_Y=False,rank=1):
self.num_outputs = len(Y_list) self.output_dim = len(Y_list)
assert len(X_list) == self.num_outputs, 'Number of outputs do not match length of inputs list.' assert len(X_list) == self.output_dim, 'Number of outputs do not match length of inputs list.'
#Inputs indexing #Inputs indexing
i = 0 i = 0
@ -51,7 +51,7 @@ class GPMultioutputRegression(GP):
#Coregionalization kernel definition #Coregionalization kernel definition
if kernel_list is None: if kernel_list is None:
kernel_list = [kern.rbf(original_dim)] kernel_list = [kern.rbf(original_dim)]
mkernel = kern.build_lcm(input_dim=original_dim, num_outputs=self.num_outputs, kernel_list = kernel_list, W_columns=W_columns) mkernel = kern.build_lcm(input_dim=original_dim, output_dim=self.output_dim, kernel_list = kernel_list, rank=rank)
self.multioutput = True self.multioutput = True
GP.__init__(self, X, likelihood, mkernel, normalize_X=normalize_X) GP.__init__(self, X, likelihood, mkernel, normalize_X=normalize_X)

View file

@ -39,6 +39,7 @@ class MRD(Model):
:param num_inducing: number of inducing inputs to use :param num_inducing: number of inducing inputs to use
:param kernels: list of kernels or kernel shared for all BGPLVMS :param kernels: list of kernels or kernel shared for all BGPLVMS
:type kernels: [GPy.kern.kern] | GPy.kern.kern | None (default) :type kernels: [GPy.kern.kern] | GPy.kern.kern | None (default)
""" """
def __init__(self, likelihood_or_Y_list, input_dim, num_inducing=10, names=None, def __init__(self, likelihood_or_Y_list, input_dim, num_inducing=10, names=None,
kernels=None, initx='PCA', kernels=None, initx='PCA',
@ -338,8 +339,11 @@ class MRD(Model):
def plot_scales(self, fignum=None, ax=None, titles=None, sharex=False, sharey=True, *args, **kwargs): def plot_scales(self, fignum=None, ax=None, titles=None, sharex=False, sharey=True, *args, **kwargs):
""" """
:param:`titles` :
titles for axes of datasets TODO: Explain other parameters
:param titles: titles for axes of datasets
""" """
if titles is None: if titles is None:
titles = [r'${}$'.format(name) for name in self.names] titles = [r'${}$'.format(name) for name in self.names]

View file

@ -30,23 +30,23 @@ class SparseGPMultioutputRegression(SparseGP):
:type Z_list: list of numpy arrays (num_inducing_output_i x input_dim), one array per output | empty list :type Z_list: list of numpy arrays (num_inducing_output_i x input_dim), one array per output | empty list
:param num_inducing: number of inducing inputs per output, defaults to 10 (ignored if Z_list is not empty) :param num_inducing: number of inducing inputs per output, defaults to 10 (ignored if Z_list is not empty)
:type num_inducing: integer :type num_inducing: integer
:param W_columns: number tuples of the corregionalization parameters 'coregion_W' (see coregionalize kernel documentation) :param rank: number tuples of the corregionalization parameters 'coregion_W' (see coregionalize kernel documentation)
:type W_columns: integer :type rank: integer
""" """
#NOTE not tested with uncertain inputs #NOTE not tested with uncertain inputs
def __init__(self,X_list,Y_list,kernel_list=None,noise_variance_list=None,normalize_X=False,normalize_Y=False,Z_list=[],num_inducing=10,W_columns=1): def __init__(self,X_list,Y_list,kernel_list=None,noise_variance_list=None,normalize_X=False,normalize_Y=False,Z_list=[],num_inducing=10,rank=1):
self.num_outputs = len(Y_list) self.output_dim = len(Y_list)
assert len(X_list) == self.num_outputs, 'Number of outputs do not match length of inputs list.' assert len(X_list) == self.output_dim, 'Number of outputs do not match length of inputs list.'
#Inducing inputs list #Inducing inputs list
if len(Z_list): if len(Z_list):
assert len(Z_list) == self.num_outputs, 'Number of outputs do not match length of inducing inputs list.' assert len(Z_list) == self.output_dim, 'Number of outputs do not match length of inducing inputs list.'
else: else:
if isinstance(num_inducing,np.int): if isinstance(num_inducing,np.int):
num_inducing = [num_inducing] * self.num_outputs num_inducing = [num_inducing] * self.output_dim
num_inducing = np.asarray(num_inducing) num_inducing = np.asarray(num_inducing)
assert num_inducing.size == self.num_outputs, 'Number of outputs do not match length of inducing inputs list.' assert num_inducing.size == self.output_dim, 'Number of outputs do not match length of inducing inputs list.'
for ni,X in zip(num_inducing,X_list): for ni,X in zip(num_inducing,X_list):
i = np.random.permutation(X.shape[0])[:ni] i = np.random.permutation(X.shape[0])[:ni]
Z_list.append(X[i].copy()) Z_list.append(X[i].copy())
@ -72,7 +72,7 @@ class SparseGPMultioutputRegression(SparseGP):
#Coregionalization kernel definition #Coregionalization kernel definition
if kernel_list is None: if kernel_list is None:
kernel_list = [kern.rbf(original_dim)] kernel_list = [kern.rbf(original_dim)]
mkernel = kern.build_lcm(input_dim=original_dim, num_outputs=self.num_outputs, kernel_list = kernel_list, W_columns=W_columns) mkernel = kern.build_lcm(input_dim=original_dim, output_dim=self.output_dim, kernel_list = kernel_list, rank=rank)
self.multioutput = True self.multioutput = True
SparseGP.__init__(self, X, likelihood, mkernel, Z=Z, normalize_X=normalize_X) SparseGP.__init__(self, X, likelihood, mkernel, Z=Z, normalize_X=normalize_X)

View file

@ -0,0 +1,50 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt)
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import unittest
import numpy as np
import GPy
class BCGPLVMTests(unittest.TestCase):
def test_kernel_backconstraint(self):
num_data, num_inducing, input_dim, output_dim = 10, 3, 2, 4
X = np.random.rand(num_data, input_dim)
k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
K = k.K(X)
Y = np.random.multivariate_normal(np.zeros(num_data),K,output_dim).T
k = GPy.kern.mlp(input_dim) + GPy.kern.bias(input_dim)
bk = GPy.kern.rbf(output_dim)
mapping = GPy.mappings.Kernel(output_dim=input_dim, X=Y, kernel=bk)
m = GPy.models.BCGPLVM(Y, input_dim, kernel = k, mapping=mapping)
m.randomize()
self.assertTrue(m.checkgrad())
def test_linear_backconstraint(self):
num_data, num_inducing, input_dim, output_dim = 10, 3, 2, 4
X = np.random.rand(num_data, input_dim)
k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
K = k.K(X)
Y = np.random.multivariate_normal(np.zeros(num_data),K,output_dim).T
k = GPy.kern.mlp(input_dim) + GPy.kern.bias(input_dim)
bk = GPy.kern.rbf(output_dim)
mapping = GPy.mappings.Linear(output_dim=input_dim, input_dim=output_dim)
m = GPy.models.BCGPLVM(Y, input_dim, kernel = k, mapping=mapping)
m.randomize()
self.assertTrue(m.checkgrad())
def test_mlp_backconstraint(self):
num_data, num_inducing, input_dim, output_dim = 10, 3, 2, 4
X = np.random.rand(num_data, input_dim)
k = GPy.kern.rbf(input_dim) + GPy.kern.white(input_dim, 0.00001)
K = k.K(X)
Y = np.random.multivariate_normal(np.zeros(num_data),K,output_dim).T
k = GPy.kern.mlp(input_dim) + GPy.kern.bias(input_dim)
bk = GPy.kern.rbf(output_dim)
mapping = GPy.mappings.MLP(output_dim=input_dim, input_dim=output_dim, hidden_dim=[5, 4, 7])
m = GPy.models.BCGPLVM(Y, input_dim, kernel = k, mapping=mapping)
m.randomize()
self.assertTrue(m.checkgrad())
if __name__ == "__main__":
print "Running unit tests, please be (very) patient..."
unittest.main()

View file

@ -55,7 +55,18 @@ class BGPLVMTests(unittest.TestCase):
m.randomize() m.randomize()
self.assertTrue(m.checkgrad()) self.assertTrue(m.checkgrad())
#@unittest.skip('psi2 cross terms are NotImplemented for this combination') def test_rbf_line_kern(self):
N, num_inducing, input_dim, D = 10, 3, 2, 4
X = np.random.rand(N, input_dim)
k = GPy.kern.rbf(input_dim) + GPy.kern.linear(input_dim) + GPy.kern.white(input_dim, 0.00001)
K = k.K(X)
Y = np.random.multivariate_normal(np.zeros(N),K,input_dim).T
Y -= Y.mean(axis=0)
k = GPy.kern.rbf(input_dim) + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001)
m = BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
m.randomize()
self.assertTrue(m.checkgrad())
def test_linear_bias_kern(self): def test_linear_bias_kern(self):
N, num_inducing, input_dim, D = 30, 5, 4, 30 N, num_inducing, input_dim, D = 30, 5, 4, 30
X = np.random.rand(N, input_dim) X = np.random.rand(N, input_dim)

View file

@ -21,6 +21,10 @@ class KernelTests(unittest.TestCase):
kern = GPy.kern.rbf(5) kern = GPy.kern.rbf(5)
self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose)) self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
def test_rbf_sympykernel(self):
kern = GPy.kern.rbf_sympy(5)
self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
def test_rbf_invkernel(self): def test_rbf_invkernel(self):
kern = GPy.kern.rbf_inv(5) kern = GPy.kern.rbf_inv(5)
self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose)) self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
@ -79,19 +83,19 @@ class KernelTests(unittest.TestCase):
kern = GPy.kern.poly(5, degree=4) kern = GPy.kern.poly(5, degree=4)
self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose)) self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
def test_coregionalization(self): # def test_coregionalization(self):
X1 = np.random.rand(50,1)*8 # X1 = np.random.rand(50,1)*8
X2 = np.random.rand(30,1)*5 # X2 = np.random.rand(30,1)*5
index = np.vstack((np.zeros_like(X1),np.ones_like(X2))) # index = np.vstack((np.zeros_like(X1),np.ones_like(X2)))
X = np.hstack((np.vstack((X1,X2)),index)) # X = np.hstack((np.vstack((X1,X2)),index))
Y1 = np.sin(X1) + np.random.randn(*X1.shape)*0.05 # Y1 = np.sin(X1) + np.random.randn(*X1.shape)*0.05
Y2 = np.sin(X2) + np.random.randn(*X2.shape)*0.05 + 2. # Y2 = np.sin(X2) + np.random.randn(*X2.shape)*0.05 + 2.
Y = np.vstack((Y1,Y2)) # Y = np.vstack((Y1,Y2))
k1 = GPy.kern.rbf(1) + GPy.kern.bias(1) # k1 = GPy.kern.rbf(1) + GPy.kern.bias(1)
k2 = GPy.kern.coregionalize(2,1) # k2 = GPy.kern.coregionalize(2,1)
kern = k1**k2 # kern = k1**k2
self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose)) # self.assertTrue(GPy.kern.kern_test(kern, verbose=verbose))
if __name__ == "__main__": if __name__ == "__main__":

View file

@ -105,7 +105,7 @@ class Test(unittest.TestCase):
def test_psi2(self): def test_psi2(self):
for kern in self.kerns: for kern in self.kerns:
Nsamples = self.Nsamples/300. Nsamples = self.Nsamples/10.
psi2 = kern.psi2(self.Z, self.q_x_mean, self.q_x_variance) psi2 = kern.psi2(self.Z, self.q_x_mean, self.q_x_variance)
K_ = np.zeros((self.num_inducing, self.num_inducing)) K_ = np.zeros((self.num_inducing, self.num_inducing))
diffs = [] diffs = []

View file

@ -238,6 +238,18 @@ class GradientTests(unittest.TestCase):
m.constrain_fixed('.*rbf_var', 1.) m.constrain_fixed('.*rbf_var', 1.)
self.assertTrue(m.checkgrad()) self.assertTrue(m.checkgrad())
def multioutput_sparse_regression_1D(self):
X1 = np.random.rand(500, 1) * 8
X2 = np.random.rand(300, 1) * 5
X = np.vstack((X1, X2))
Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05
Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05
Y = np.vstack((Y1, Y2))
k1 = GPy.kern.rbf(1)
m = GPy.models.SparseGPMultioutputRegression(X_list=[X1,X2],Y_list=[Y1,Y2],kernel_list=[k1])
m.constrain_fixed('.*rbf_var', 1.)
self.assertTrue(m.checkgrad())
if __name__ == "__main__": if __name__ == "__main__":
print "Running unit tests, please be (very) patient..." print "Running unit tests, please be (very) patient..."

View file

@ -7,9 +7,21 @@ import urllib as url
import zipfile import zipfile
import tarfile import tarfile
import datetime import datetime
ipython_notebook = False
if ipython_notebook:
import IPython.core.display
def ipynb_input(varname, prompt=''):
"""Prompt user for input and assign string val to given variable name."""
js_code = ("""
var value = prompt("{prompt}","");
var py_code = "{varname} = '" + value + "'";
IPython.notebook.kernel.execute(py_code);
""").format(prompt=prompt, varname=varname)
return IPython.core.display.Javascript(js_code)
import sys, urllib import sys, urllib
def reporthook(a,b,c): def reporthook(a,b,c):
# ',' at the end of the line is important! # ',' at the end of the line is important!
#print "% 3.1f%% of %d bytes\r" % (min(100, float(a * b) / c * 100), c), #print "% 3.1f%% of %d bytes\r" % (min(100, float(a * b) / c * 100), c),
@ -130,14 +142,18 @@ The database was created with funding from NSF EIA-0196217.""",
'license' : None, 'license' : None,
'size' : 24229368}, 'size' : 24229368},
} }
def prompt_user(): def prompt_user():
"""Ask user for agreeing to data set licenses.""" """Ask user for agreeing to data set licenses."""
# raw_input returns the empty string for "enter" # raw_input returns the empty string for "enter"
yes = set(['yes', 'y']) yes = set(['yes', 'y'])
no = set(['no','n']) no = set(['no','n'])
choice = ''
choice = raw_input().lower() if ipython_notebook:
ipynb_input(choice, prompt='provide your answer here')
else:
choice = raw_input().lower()
if choice in yes: if choice in yes:
return True return True
elif choice in no: elif choice in no:
@ -146,6 +162,7 @@ def prompt_user():
sys.stdout.write("Please respond with 'yes', 'y' or 'no', 'n'") sys.stdout.write("Please respond with 'yes', 'y' or 'no', 'n'")
return prompt_user() return prompt_user()
def data_available(dataset_name=None): def data_available(dataset_name=None):
"""Check if the data set is available on the local machine already.""" """Check if the data set is available on the local machine already."""
for file_list in data_resources[dataset_name]['files']: for file_list in data_resources[dataset_name]['files']:
@ -524,11 +541,14 @@ def simulation_BGPLVM():
'info': "Simulated test dataset generated in MATLAB to compare BGPLVM between python and MATLAB"} 'info': "Simulated test dataset generated in MATLAB to compare BGPLVM between python and MATLAB"}
def toy_rbf_1d(seed=default_seed, num_samples=500): def toy_rbf_1d(seed=default_seed, num_samples=500):
"""Samples values of a function from an RBF covariance with very small noise for inputs uniformly distributed between -1 and 1. """
Samples values of a function from an RBF covariance with very small noise for inputs uniformly distributed between -1 and 1.
:param seed: seed to use for random sampling. :param seed: seed to use for random sampling.
:type seed: int :type seed: int
:param num_samples: number of samples to sample in the function (default 500). :param num_samples: number of samples to sample in the function (default 500).
:type num_samples: int :type num_samples: int
""" """
np.random.seed(seed=seed) np.random.seed(seed=seed)
num_in = 1 num_in = 1
@ -631,11 +651,15 @@ def olympic_marathon_men(data_set='olympic_marathon_men'):
def crescent_data(num_data=200, seed=default_seed): def crescent_data(num_data=200, seed=default_seed):
"""Data set formed from a mixture of four Gaussians. In each class two of the Gaussians are elongated at right angles to each other and offset to form an approximation to the crescent data that is popular in semi-supervised learning as a toy problem. """
Data set formed from a mixture of four Gaussians. In each class two of the Gaussians are elongated at right angles to each other and offset to form an approximation to the crescent data that is popular in semi-supervised learning as a toy problem.
:param num_data_part: number of data to be sampled (default is 200). :param num_data_part: number of data to be sampled (default is 200).
:type num_data: int :type num_data: int
:param seed: random seed to be used for data generation. :param seed: random seed to be used for data generation.
:type seed: int""" :type seed: int
"""
np.random.seed(seed=seed) np.random.seed(seed=seed)
sqrt2 = np.sqrt(2) sqrt2 = np.sqrt(2)
# Rotation matrix # Rotation matrix

63
GPy/util/erfcx.py Normal file
View file

@ -0,0 +1,63 @@
## Copyright (C) 2010 Soren Hauberg
##
## Copyright James Hensman 2011
##
## This program is free software; you can redistribute it and/or modify it
## under the terms of the GNU General Public License as published by
## the Free Software Foundation; either version 3 of the License, or (at
## your option) any later version.
##
## This program is distributed in the hope that it will be useful, but
## WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
## General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program; see the file COPYING. If not, see
## <http://www.gnu.org/licenses/>.
import numpy as np
def erfcx (arg):
arg = np.atleast_1d(arg)
assert(np.all(np.isreal(arg)),"erfcx: input must be real")
## Get precision dependent thresholds -- or not :p
xneg = -26.628;
xmax = 2.53e+307;
## Allocate output
result = np.zeros (arg.shape)
## Find values where erfcx can be evaluated
idx_neg = (arg < xneg);
idx_max = (arg > xmax);
idx = ~(idx_neg | idx_max);
arg = arg [idx];
## Perform the actual computation
t = 3.97886080735226 / (np.abs (arg) + 3.97886080735226);
u = t - 0.5;
y = (((((((((u * 0.00127109764952614092 + 1.19314022838340944e-4) * u \
- 0.003963850973605135) * u - 8.70779635317295828e-4) * u + \
0.00773672528313526668) * u + 0.00383335126264887303) * u - \
0.0127223813782122755) * u - 0.0133823644533460069) * u + \
0.0161315329733252248) * u + 0.0390976845588484035) * u + \
0.00249367200053503304;
y = ((((((((((((y * u - 0.0838864557023001992) * u - \
0.119463959964325415) * u + 0.0166207924969367356) * u + \
0.357524274449531043) * u + 0.805276408752910567) * u + \
1.18902982909273333) * u + 1.37040217682338167) * u + \
1.31314653831023098) * u + 1.07925515155856677) * u + \
0.774368199119538609) * u + 0.490165080585318424) * u + \
0.275374741597376782) * t;
y [arg < 0] = 2 * np.exp (arg [arg < 0]**2) - y [arg < 0];
## Put the results back into something with the same size is the original input
result [idx] = y;
result [idx_neg] = np.inf;
## result (idx_max) = 0; # not needed as we initialise with zeros
return(result)

View file

@ -27,48 +27,56 @@ except:
_blas_available = False _blas_available = False
def dtrtrs(A, B, lower=0, trans=0, unitdiag=0): def dtrtrs(A, B, lower=0, trans=0, unitdiag=0):
"""Wrapper for lapack dtrtrs function """
Wrapper for lapack dtrtrs function
:param A: Matrix A :param A: Matrix A
:param B: Matrix B :param B: Matrix B
:param lower: is matrix lower (true) or upper (false) :param lower: is matrix lower (true) or upper (false)
:returns: :returns:
""" """
return lapack.dtrtrs(A, B, lower=lower, trans=trans, unitdiag=unitdiag) return lapack.dtrtrs(A, B, lower=lower, trans=trans, unitdiag=unitdiag)
def dpotrs(A, B, lower=0): def dpotrs(A, B, lower=0):
"""Wrapper for lapack dpotrs function """
Wrapper for lapack dpotrs function
:param A: Matrix A :param A: Matrix A
:param B: Matrix B :param B: Matrix B
:param lower: is matrix lower (true) or upper (false) :param lower: is matrix lower (true) or upper (false)
:returns: :returns:
""" """
return lapack.dpotrs(A, B, lower=lower) return lapack.dpotrs(A, B, lower=lower)
def dpotri(A, lower=0): def dpotri(A, lower=0):
"""Wrapper for lapack dpotri function """
Wrapper for lapack dpotri function
:param A: Matrix A :param A: Matrix A
:param lower: is matrix lower (true) or upper (false) :param lower: is matrix lower (true) or upper (false)
:returns: A inverse :returns: A inverse
""" """
return lapack.dpotri(A, lower=lower) return lapack.dpotri(A, lower=lower)
def trace_dot(a, b): def trace_dot(a, b):
""" """
efficiently compute the trace of the matrix product of a and b Efficiently compute the trace of the matrix product of a and b
""" """
return np.sum(a * b) return np.sum(a * b)
def mdot(*args): def mdot(*args):
"""Multiply all the arguments using matrix product rules. """
Multiply all the arguments using matrix product rules.
The output is equivalent to multiplying the arguments one by one The output is equivalent to multiplying the arguments one by one
from left to right using dot(). from left to right using dot().
Precedence can be controlled by creating tuples of arguments, Precedence can be controlled by creating tuples of arguments,
for instance mdot(a,((b,c),d)) multiplies a (a*((b*c)*d)). for instance mdot(a,((b,c),d)) multiplies a (a*((b*c)*d)).
Note that this means the output of dot(a,b) and mdot(a,b) will differ if Note that this means the output of dot(a,b) and mdot(a,b) will differ if
a or b is a pure tuple of numbers. a or b is a pure tuple of numbers.
""" """
if len(args) == 1: if len(args) == 1:
return args[0] return args[0]
@ -115,14 +123,16 @@ def jitchol(A, maxtries=5):
def jitchol_old(A, maxtries=5): def jitchol_old(A, maxtries=5):
""" """
:param A : An almost pd square matrix :param A: An almost pd square matrix
:rval L: the Cholesky decomposition of A :rval L: the Cholesky decomposition of A
.. Note: .. note:
Adds jitter to K, to enforce positive-definiteness Adds jitter to K, to enforce positive-definiteness
if stuff breaks, please check: if stuff breaks, please check:
np.allclose(sp.linalg.cholesky(XXT, lower = True), np.triu(sp.linalg.cho_factor(XXT)[0]).T) np.allclose(sp.linalg.cholesky(XXT, lower = True), np.triu(sp.linalg.cho_factor(XXT)[0]).T)
""" """
try: try:
return linalg.cholesky(A, lower=True) return linalg.cholesky(A, lower=True)
@ -142,6 +152,7 @@ def jitchol_old(A, maxtries=5):
def pdinv(A, *args): def pdinv(A, *args):
""" """
:param A: A DxD pd numpy array :param A: A DxD pd numpy array
:rval Ai: the inverse of A :rval Ai: the inverse of A
@ -152,6 +163,7 @@ def pdinv(A, *args):
:rtype Li: np.ndarray :rtype Li: np.ndarray
:rval logdet: the log of the determinant of A :rval logdet: the log of the determinant of A
:rtype logdet: float64 :rtype logdet: float64
""" """
L = jitchol(A, *args) L = jitchol(A, *args)
logdet = 2.*np.sum(np.log(np.diag(L))) logdet = 2.*np.sum(np.log(np.diag(L)))
@ -177,14 +189,13 @@ def chol_inv(L):
def multiple_pdinv(A): def multiple_pdinv(A):
""" """
Arguments
---------
:param A: A DxDxN numpy array (each A[:,:,i] is pd) :param A: A DxDxN numpy array (each A[:,:,i] is pd)
Returns :rval invs: the inverses of A
------- :rtype invs: np.ndarray
invs : the inverses of A :rval hld: 0.5* the log of the determinants of A
hld: 0.5* the log of the determinants of A :rtype hld: np.array
""" """
N = A.shape[-1] N = A.shape[-1]
chols = [jitchol(A[:, :, i]) for i in range(N)] chols = [jitchol(A[:, :, i]) for i in range(N)]
@ -198,15 +209,13 @@ def PCA(Y, input_dim):
""" """
Principal component analysis: maximum likelihood solution by SVD Principal component analysis: maximum likelihood solution by SVD
Arguments
---------
:param Y: NxD np.array of data :param Y: NxD np.array of data
:param input_dim: int, dimension of projection :param input_dim: int, dimension of projection
Returns
-------
:rval X: - Nxinput_dim np.array of dimensionality reduced data :rval X: - Nxinput_dim np.array of dimensionality reduced data
W - input_dimxD mapping from X to Y :rval W: - input_dimxD mapping from X to Y
""" """
if not np.allclose(Y.mean(axis=0), 0.0): if not np.allclose(Y.mean(axis=0), 0.0):
print "Y is not zero mean, centering it locally (GPy.util.linalg.PCA)" print "Y is not zero mean, centering it locally (GPy.util.linalg.PCA)"
@ -273,11 +282,10 @@ def DSYR_blas(A, x, alpha=1.):
Performs a symmetric rank-1 update operation: Performs a symmetric rank-1 update operation:
A <- A + alpha * np.dot(x,x.T) A <- A + alpha * np.dot(x,x.T)
Arguments
---------
:param A: Symmetric NxN np.array :param A: Symmetric NxN np.array
:param x: Nx1 np.array :param x: Nx1 np.array
:param alpha: scalar :param alpha: scalar
""" """
N = c_int(A.shape[0]) N = c_int(A.shape[0])
LDA = c_int(A.shape[0]) LDA = c_int(A.shape[0])
@ -295,11 +303,10 @@ def DSYR_numpy(A, x, alpha=1.):
Performs a symmetric rank-1 update operation: Performs a symmetric rank-1 update operation:
A <- A + alpha * np.dot(x,x.T) A <- A + alpha * np.dot(x,x.T)
Arguments
---------
:param A: Symmetric NxN np.array :param A: Symmetric NxN np.array
:param x: Nx1 np.array :param x: Nx1 np.array
:param alpha: scalar :param alpha: scalar
""" """
A += alpha * np.dot(x[:, None], x[None, :]) A += alpha * np.dot(x[:, None], x[None, :])
@ -363,8 +370,9 @@ def cholupdate(L, x):
""" """
update the LOWER cholesky factor of a pd matrix IN PLACE update the LOWER cholesky factor of a pd matrix IN PLACE
if L is the lower chol. of K, then this function computes L_ if L is the lower chol. of K, then this function computes L\_
where L_ is the lower chol of K + x*x^T where L\_ is the lower chol of K + x*x^T
""" """
support_code = """ support_code = """
#include <math.h> #include <math.h>

110
GPy/util/ln_diff_erfs.py Normal file
View file

@ -0,0 +1,110 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
#Only works for scipy 0.12+
try:
from scipy.special import erfcx, erf
except ImportError:
from scipy.special import erf
from erfcx import erfcx
import numpy as np
def ln_diff_erfs(x1, x2, return_sign=False):
"""Function for stably computing the log of difference of two erfs in a numerically stable manner.
:param x1 : argument of the positive erf
:type x1: ndarray
:param x2 : argument of the negative erf
:type x2: ndarray
:return: tuple containing (log(abs(erf(x1) - erf(x2))), sign(erf(x1) - erf(x2)))
Based on MATLAB code that was written by Antti Honkela and modified by David Luengo and originally derived from code by Neil Lawrence.
"""
x1 = np.require(x1).real
x2 = np.require(x2).real
if x1.size==1:
x1 = np.reshape(x1, (1, 1))
if x2.size==1:
x2 = np.reshape(x2, (1, 1))
if x1.shape==x2.shape:
v = np.zeros_like(x1)
else:
if x1.size==1:
v = np.zeros(x2.shape)
elif x2.size==1:
v = np.zeros(x1.shape)
else:
raise ValueError, "This function does not broadcast unless provided with a scalar."
if x1.size == 1:
x1 = np.tile(x1, x2.shape)
if x2.size == 1:
x2 = np.tile(x2, x1.shape)
sign = np.sign(x1 - x2)
if x1.size == 1:
if sign== -1:
swap = x1
x1 = x2
x2 = swap
else:
I = sign == -1
swap = x1[I]
x1[I] = x2[I]
x2[I] = swap
with np.errstate(divide='ignore'):
# switch off log of zero warnings.
# Case 0: arguments of different sign, no problems with loss of accuracy
I0 = np.logical_or(np.logical_and(x1>0, x2<0), np.logical_and(x2>0, x1<0)) # I1=(x1*x2)<0
# Case 1: x1 = x2 so we have log of zero.
I1 = (x1 == x2)
# Case 2: Both arguments are non-negative
I2 = np.logical_and(x1 > 0, np.logical_and(np.logical_not(I0),
np.logical_not(I1)))
# Case 3: Both arguments are non-positive
I3 = np.logical_and(np.logical_and(np.logical_not(I0),
np.logical_not(I1)),
np.logical_not(I2))
_x2 = x2.flatten()
_x1 = x1.flatten()
for group, flags in zip((0, 1, 2, 3), (I0, I1, I2, I3)):
if np.any(flags):
if not x1.size==1:
_x1 = x1[flags]
if not x2.size==1:
_x2 = x2[flags]
if group==0:
v[flags] = np.log( erf(_x1) - erf(_x2) )
elif group==1:
v[flags] = -np.inf
elif group==2:
v[flags] = np.log(erfcx(_x2)
-erfcx(_x1)*np.exp(_x2**2
-_x1**2)) - _x2**2
elif group==3:
v[flags] = np.log(erfcx(-_x1)
-erfcx(-_x2)*np.exp(_x1**2
-_x2**2))-_x1**2
# TODO: switch back on log of zero warnings.
if return_sign:
return v, sign
else:
if v.size==1:
if sign==-1:
v = v.view('complex64')
v += np.pi*1j
else:
# Need to add in a complex part because argument is negative.
v = v.view('complex64')
v[I] += np.pi*1j
return v

View file

@ -17,12 +17,9 @@ def linear_grid(D, n = 100, min_max = (-100, 100)):
""" """
Creates a D-dimensional grid of n linearly spaced points Creates a D-dimensional grid of n linearly spaced points
Parameters: :param D: dimension of the grid
:param n: number of points
D: dimension of the grid :param min_max: (min, max) list
n: number of points
min_max: (min, max) list
""" """
@ -39,6 +36,7 @@ def kmm_init(X, m = 10):
:param X: data :param X: data
:param m: number of inducing points :param m: number of inducing points
""" """
# compute the distances # compute the distances

View file

@ -92,13 +92,15 @@ class tree:
def swap_vertices(self, i, j): def swap_vertices(self, i, j):
"""Swap two vertices in the tree structure array. """
Swap two vertices in the tree structure array.
swap_vertex swaps the location of two vertices in a tree structure array. swap_vertex swaps the location of two vertices in a tree structure array.
ARG tree : the tree for which two vertices are to be swapped.
ARG i : the index of the first vertex to be swapped. :param tree: the tree for which two vertices are to be swapped.
ARG j : the index of the second vertex to be swapped. :param i: the index of the first vertex to be swapped.
RETURN tree : the tree structure with the two vertex locations :param j: the index of the second vertex to be swapped.
swapped. :rval tree: the tree structure with the two vertex locations swapped.
""" """
store_vertex_i = self.vertices[i] store_vertex_i = self.vertices[i]
store_vertex_j = self.vertices[j] store_vertex_j = self.vertices[j]
@ -117,12 +119,17 @@ class tree:
def rotation_matrix(xangle, yangle, zangle, order='zxy', degrees=False): def rotation_matrix(xangle, yangle, zangle, order='zxy', degrees=False):
"""Compute the rotation matrix for an angle in each direction. """
Compute the rotation matrix for an angle in each direction.
This is a helper function for computing the rotation matrix for a given set of angles in a given order. This is a helper function for computing the rotation matrix for a given set of angles in a given order.
ARG xangle : rotation for x-axis.
ARG yangle : rotation for y-axis. :param xangle: rotation for x-axis.
ARG zangle : rotation for z-axis. :param yangle: rotation for y-axis.
ARG order : the order for the rotations.""" :param zangle: rotation for z-axis.
:param order: the order for the rotations.
"""
if degrees: if degrees:
xangle = math.radians(xangle) xangle = math.radians(xangle)
yangle = math.radians(yangle) yangle = math.radians(yangle)
@ -301,10 +308,12 @@ class acclaim_skeleton(skeleton):
def load_skel(self, file_name): def load_skel(self, file_name):
"""Loads an ASF file into a skeleton structure. """
loads skeleton structure from an acclaim skeleton file. Loads an ASF file into a skeleton structure.
ARG file_name : the file name to load in.
RETURN skel : the skeleton for the file.""" :param file_name: The file name to load in.
"""
fid = open(file_name, 'r') fid = open(file_name, 'r')
self.read_skel(fid) self.read_skel(fid)

View file

@ -15,7 +15,7 @@ def most_significant_input_dimensions(model, which_indices):
try: try:
input_1, input_2 = np.argsort(model.input_sensitivity())[::-1][:2] input_1, input_2 = np.argsort(model.input_sensitivity())[::-1][:2]
except: except:
raise ValueError, "cannot Atomatically determine which dimensions to plot, please pass 'which_indices'" raise ValueError, "cannot automatically determine which dimensions to plot, please pass 'which_indices'"
else: else:
input_1, input_2 = which_indices input_1, input_2 = which_indices
return input_1, input_2 return input_1, input_2

32
GPy/util/symbolic.py Normal file
View file

@ -0,0 +1,32 @@
from sympy import Function, S, oo, I, cos, sin
class sinc_grad(Function):
    """Symbolic derivative of the sinc function: d/dx [sin(x)/x] = (x*cos(x) - sin(x))/x**2."""
    nargs = 1

    def fdiff(self, argindex=1):
        # Second derivative of sinc, i.e. d/dx [(x*cos(x) - sin(x))/x**2].
        # Bug fix: `x` was previously undefined in this method (NameError on call);
        # it must be bound to the function's argument.
        x = self.args[0]
        # NOTE(review): this expression is singular at x = 0 — presumably callers
        # never differentiate at the origin; confirm if that case matters.
        return ((2 - x*x)*sin(x) - 2*x*cos(x))/(x*x*x)

    @classmethod
    def eval(cls, x):
        # sinc'(0) = 0 (limit of (x*cos(x) - sin(x))/x**2 as x -> 0).
        if x is S.Zero:
            return S.Zero
        else:
            return (x*cos(x) - sin(x))/(x*x)
class sinc(Function):
    """Symbolic sinc function, sin(x)/x, with the removable singularity sinc(0) = 1."""
    nargs = 1

    def fdiff(self, argindex=1):
        # The derivative is represented by the companion sinc_grad function.
        return sinc_grad(self.args[0])

    @classmethod
    def eval(cls, x):
        # Guard the removable singularity at the origin first.
        if x is S.Zero:
            return S.One
        return sin(x)/x

    def _eval_is_real(self):
        # sinc is real-valued exactly when its argument is real.
        return self.args[0].is_real

View file

@ -502,11 +502,14 @@ def data_play(Y, visualizer, frame_rate=30):
This example loads in the CMU mocap database (http://mocap.cs.cmu.edu) subject number 35 motion number 01. It then plays it using the mocap_show visualize object. This example loads in the CMU mocap database (http://mocap.cs.cmu.edu) subject number 35 motion number 01. It then plays it using the mocap_show visualize object.
data = GPy.util.datasets.cmu_mocap(subject='35', train_motions=['01']) .. code-block:: python
Y = data['Y']
Y[:, 0:3] = 0. # Make figure walk in place data = GPy.util.datasets.cmu_mocap(subject='35', train_motions=['01'])
visualize = GPy.util.visualize.skeleton_show(Y[0, :], data['skel']) Y = data['Y']
GPy.util.visualize.data_play(Y, visualize) Y[:, 0:3] = 0. # Make figure walk in place
visualize = GPy.util.visualize.skeleton_show(Y[0, :], data['skel'])
GPy.util.visualize.data_play(Y, visualize)
""" """

View file

@ -53,9 +53,11 @@ class TanhWarpingFunction(WarpingFunction):
self.num_parameters = 3 * self.n_terms self.num_parameters = 3 * self.n_terms
def f(self,y,psi): def f(self,y,psi):
"""transform y with f using parameter vector psi """
transform y with f using parameter vector psi
psi = [[a,b,c]] psi = [[a,b,c]]
f = \sum_{terms} a * tanh(b*(y+c)) ::math::`f = \\sum_{terms} a * tanh(b*(y+c))`
""" """
#1. check that number of params is consistent #1. check that number of params is consistent
@ -77,8 +79,7 @@ class TanhWarpingFunction(WarpingFunction):
""" """
calculate the numerical inverse of f calculate the numerical inverse of f
== input == :param iterations: number of N.R. iterations
iterations: number of N.R. iterations
""" """
@ -165,9 +166,11 @@ class TanhWarpingFunction_d(WarpingFunction):
self.num_parameters = 3 * self.n_terms + 1 self.num_parameters = 3 * self.n_terms + 1
def f(self,y,psi): def f(self,y,psi):
"""transform y with f using parameter vector psi """
Transform y with f using parameter vector psi
psi = [[a,b,c]] psi = [[a,b,c]]
f = \sum_{terms} a * tanh(b*(y+c))
:math:`f = \\sum_{terms} a * tanh(b*(y+c))`
""" """
#1. check that number of params is consistent #1. check that number of params is consistent
@ -189,8 +192,7 @@ class TanhWarpingFunction_d(WarpingFunction):
""" """
calculate the numerical inverse of f calculate the numerical inverse of f
== input == :param max_iterations: maximum number of N.R. iterations
iterations: number of N.R. iterations
""" """
@ -214,12 +216,13 @@ class TanhWarpingFunction_d(WarpingFunction):
def fgrad_y(self, y, psi, return_precalc = False): def fgrad_y(self, y, psi, return_precalc = False):
""" """
gradient of f w.r.t to y ([N x 1]) gradient of f w.r.t to y ([N x 1])
returns: Nx1 vector of derivatives, unless return_precalc is true,
then it also returns the precomputed stuff :returns: Nx1 vector of derivatives, unless return_precalc is true, then it also returns the precomputed stuff
""" """
mpsi = psi.copy() mpsi = psi.copy()
d = psi[-1] d = psi[-1]
mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3) mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3)
@ -242,7 +245,7 @@ class TanhWarpingFunction_d(WarpingFunction):
""" """
gradient of f w.r.t to y and psi gradient of f w.r.t to y and psi
returns: NxIx4 tensor of partial derivatives :returns: NxIx4 tensor of partial derivatives
""" """

View file

@ -2,7 +2,7 @@
# #
# You can set these variables from the command line. # You can set these variables from the command line.
SPHINXOPTS = SPHINXOPTS = -a -w log.txt -E
SPHINXBUILD = sphinx-build SPHINXBUILD = sphinx-build
PAPER = PAPER =
BUILDDIR = _build BUILDDIR = _build

View file

@ -106,7 +106,7 @@ class Mock(object):
print "Mocking" print "Mocking"
MOCK_MODULES = ['sympy', MOCK_MODULES = ['sympy',
'sympy.utilities', 'sympy.utilities.codegen', 'sympy.core.cache', 'sympy.utilities', 'sympy.utilities.codegen', 'sympy.core.cache',
'sympy.core', 'sympy.parsing', 'sympy.parsing.sympy_parser' 'sympy.core', 'sympy.parsing', 'sympy.parsing.sympy_parser', 'Tango', 'numdifftools'
] ]
for mod_name in MOCK_MODULES: for mod_name in MOCK_MODULES:
sys.modules[mod_name] = Mock() sys.modules[mod_name] = Mock()

View file

@ -20,7 +20,7 @@ All of the examples included in GPy return an instance
of a model class, and therefore they can be called in of a model class, and therefore they can be called in
the following way: :: the following way: ::
import numpy as np import numpy as np
import pylab as pb import pylab as pb
pb.ion() pb.ion()
import GPy import GPy
@ -107,7 +107,7 @@ inputs: ::
m['iip'] = np.arange(-5,0) m['iip'] = np.arange(-5,0)
Getting the model's likelihood and gradients Getting the model's likelihood and gradients
=========================================== =============================================
Apart from printing the model, the marginal Apart from printing the model, the marginal
log-likelihood can be obtained by using the function log-likelihood can be obtained by using the function
``log_likelihood()``. Also, the log-likelihood gradients ``log_likelihood()``. Also, the log-likelihood gradients