Merge branch 'devel' of github.com:SheffieldML/GPy into devel

2026-07-20 16:51:05 +02:00 · 2014-11-05 17:50:31 +00:00 · 2014-11-05 17:50:31 +00:00 · f71fd8445d
commit f71fd8445d
parent f5377aa441 17cca3c555
16 changed files with 141 additions and 1021 deletions
--- a/GPy/core/init.py
+++ b/GPy/core/init.py
@ -8,5 +8,4 @@ from parameterization.observable_array import ObsAr
 from gp import GP
 from sparse_gp import SparseGP
 from svigp import SVIGP
 from mapping import *
--- a/GPy/core/gp.py
+++ b/GPy/core/gp.py
@ -26,7 +26,7 @@ class GP(Model):
    :param Y: output observations
    :param kernel: a GPy kernel, defaults to rbf+white
    :param likelihood: a GPy likelihood
-    :param :class:`~GPy.inference.latent_function_inference.LatentFunctionInference` inference_method: The inference method to use for this GP
+    :param inference_method: The :class:`~GPy.inference.latent_function_inference.LatentFunctionInference` inference method to use for this GP
    :rtype: model object
    :param Norm normalizer:
        normalize the outputs Y.
@ -95,10 +95,13 @@ class GP(Model):
    def set_XY(self, X=None, Y=None):
        """
-        Set the input / output of the model
+        Set the input / output data of the model
-        
+        This is useful if we wish to change our existing data but maintain the same model
        :param X: input observations
        :type X: np.ndarray
        :param Y: output observations
        :type Y: np.ndarray
        """
        self.update_model(False)
        if Y is not None:
@ -129,22 +132,39 @@ class GP(Model):
    def set_X(self,X):
        """
-        Set the input of the model
+        Set the input data of the model
        :param X: input observations
        :type X: np.ndarray
        """
        self.set_XY(X=X)
    def set_Y(self,Y):
        """
-        Set the input of the model
+        Set the output data of the model
        :param X: output observations
        :type X: np.ndarray
        """
        self.set_XY(Y=Y)
    def parameters_changed(self):
        """
        Method that is called upon any changes to :class:`~GPy.core.parameterization.param.Param` variables within the model.
        In particular in the GP class this method reperforms inference, recalculating the posterior and log marginal likelihood and gradients of the model
        .. warning::
            This method is not designed to be called manually, the framework is set up to automatically call this method upon changes to parameters, if you call
            this method yourself, there may be unexpected consequences.
        """
        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.Y_metadata)
        self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
        self.kern.update_gradients_full(self.grad_dict['dL_dK'], self.X)
    def log_likelihood(self):
        """
        The log marginal likelihood of the model, :math:`p(\mathbf{y})`, this is the objective function of the model being optimised
        """
        return self._log_marginal_likelihood
    def _raw_predict(self, _Xnew, full_cov=False, kern=None):
@ -155,12 +175,10 @@ class GP(Model):
        of the prediction is computed. If full_cov is False (default), only the
        diagonal of the covariance is returned.
-        $$
+        .. math::
-        p(f*|X*, X, Y) = \int^{\inf}_{\inf} p(f*|f,X*)p(f|X,Y) df
+            p(f*|X*, X, Y) = \int^{\inf}_{\inf} p(f*|f,X*)p(f|X,Y) df
-                       = N(f*| K_{x*x}(K_{xx} + \Sigma)^{-1}Y, K_{x*x*} - K_{xx*}(K_{xx} + \Sigma)^{-1}K_{xx*}
+                        = N(f*| K_{x*x}(K_{xx} + \Sigma)^{-1}Y, K_{x*x*} - K_{xx*}(K_{xx} + \Sigma)^{-1}K_{xx*}
-        \Sigma := \texttt{Likelihood.variance / Approximate likelihood covariance}
+            \Sigma := \texttt{Likelihood.variance / Approximate likelihood covariance}
        $$
        """
        if kern is None:
            kern = self.kern
@ -185,23 +203,20 @@ class GP(Model):
        Predict the function(s) at the new point(s) Xnew.
        :param Xnew: The points at which to make a prediction
-        :type Xnew: np.ndarray, Nnew x self.input_dim
+        :type Xnew: np.ndarray (Nnew x self.input_dim)
        :param full_cov: whether to return the full covariance matrix, or just
                         the diagonal
        :type full_cov: bool
        :param Y_metadata: metadata about the predicting point to pass to the likelihood
        :param kern: The kernel to use for prediction (defaults to the model
                     kern). this is useful for examining e.g. subprocesses.
-        :returns: mean: posterior mean,  a Numpy array, Nnew x self.input_dim
+        :returns: (mean, var, lower_upper):
-        :returns: var: posterior variance, a Numpy array, Nnew x 1 if
+            mean: posterior mean, a Numpy array, Nnew x self.input_dim
-                       full_cov=False, Nnew x Nnew otherwise
+            var: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
-        :returns: lower and upper boundaries of the 95% confidence intervals,
+            lower_upper: lower and upper boundaries of the 95% confidence intervals, Numpy arrays,  Nnew x self.input_dim
                  Numpy arrays,  Nnew x self.input_dim
           If full_cov and self.input_dim > 1, the return shape of var is Nnew x Nnew x self.input_dim. If self.input_dim == 1, the return shape is Nnew x Nnew.
           This is to allow for different normalizations of the output dimensions.
        """
        #predict the latent function values
        mu, var = self._raw_predict(Xnew, full_cov=full_cov, kern=kern)
@ -213,6 +228,16 @@ class GP(Model):
        return mean, var
    def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None):
        """
        Get the predictive quantiles around the prediction at X
        :param X: The points at which to make a prediction
        :type X: np.ndarray (Xnew x self.input_dim)
        :param quantiles: tuple of quantiles, default is (2.5, 97.5) which is the 95% interval
        :type quantiles: tuple
        :returns: list of quantiles for each X and predictive quantiles for interval combination
        :rtype: [np.ndarray (Xnew x self.input_dim), np.ndarray (Xnew x self.input_dim)]
        """
        m, v = self._raw_predict(X,  full_cov=False)
        if self.normalizer is not None:
            m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v)
@ -225,7 +250,12 @@ class GP(Model):
        Given a set of points at which to predict X* (size [N*,Q]), compute the
        derivatives of the mean and variance. Resulting arrays are sized:
         dmu_dX* -- [N*, Q ,D], where D is the number of output in this GP (usually one).
         dv_dX*  -- [N*, Q],    (since all outputs have the same variance)
        :param X: The points at which to get the predictive gradients
        :type X: np.ndarray (Xnew x self.input_dim)
        :returns: dmu_dX, dv_dX
        :rtype: [np.ndarray (N*, Q ,D), np.ndarray (N*,Q) ]
        """
        dmu_dX = np.empty((Xnew.shape[0],Xnew.shape[1],self.output_dim))
@ -245,12 +275,13 @@ class GP(Model):
        Samples the posterior GP at the points X.
        :param X: The points at which to take the samples.
-        :type X: np.ndarray, Nnew x self.input_dim.
+        :type X: np.ndarray (Nnew x self.input_dim)
        :param size: the number of a posteriori samples.
        :type size: int.
        :param full_cov: whether to return the full covariance matrix, or just the diagonal.
        :type full_cov: bool.
-        :returns: Ysim: set of simulations, a Numpy array (N x samples).
+        :returns: Ysim: set of simulations
        :rtype: np.ndarray (N x samples)
        """
        m, v = self._raw_predict(X,  full_cov=full_cov)
        if self.normalizer is not None:
@ -268,7 +299,7 @@ class GP(Model):
        Samples the posterior GP at the points X.
        :param X: the points at which to take the samples.
-        :type X: np.ndarray, Nnew x self.input_dim.
+        :type X: np.ndarray (Nnew x self.input_dim.)
        :param size: the number of a posteriori samples.
        :type size: int.
        :param full_cov: whether to return the full covariance matrix, or just the diagonal.
@ -292,6 +323,37 @@ class GP(Model):
        This is a call to plot with plot_raw=True.
        Data will not be plotted in this, as the GP's view of the world
        may live in another space, or units then the data.
        Can plot only part of the data and part of the posterior functions
        using which_data_rowsm which_data_ycols.
        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
        :type plot_limits: np.array
        :param which_data_rows: which of the training data to plot (default all)
        :type which_data_rows: 'all' or a slice object to slice model.X, model.Y
        :param which_data_ycols: when the data has several columns (independant outputs), only plot these
        :type which_data_ycols: 'all' or a list of integers
        :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
        :type fixed_inputs: a list of tuples
        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
        :type resolution: int
        :param levels: number of levels to plot in a contour plot.
        :param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure
        :type levels: int
        :param samples: the number of a posteriori samples to plot
        :type samples: int
        :param fignum: figure to plot on.
        :type fignum: figure number
        :param ax: axes to plot on.
        :type ax: axes handle
        :param linecol: color of line to plot [Tango.colorsHex['darkBlue']]
        :type linecol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) as is standard in matplotlib
        :param fillcol: color of fill [Tango.colorsHex['lightBlue']]
        :type fillcol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) as is standard in matplotlib
        :param Y_metadata: additional data associated with Y which may be needed
        :type Y_metadata: dict
        :param data_symbol: symbol as used matplotlib, by default this is a black cross ('kx')
        :type data_symbol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) alongside marker type, as is standard in matplotlib.
        """
        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
        from ..plotting.matplot_dep import models_plots
@ -325,12 +387,13 @@ class GP(Model):
        :param which_data_rows: which of the training data to plot (default all)
        :type which_data_rows: 'all' or a slice object to slice model.X, model.Y
        :param which_data_ycols: when the data has several columns (independant outputs), only plot these
-        :type which_data_rows: 'all' or a list of integers
+        :type which_data_ycols: 'all' or a list of integers
        :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v.
        :type fixed_inputs: a list of tuples
        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
        :type resolution: int
        :param levels: number of levels to plot in a contour plot.
        :param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure
        :type levels: int
        :param samples: the number of a posteriori samples to plot
        :type samples: int
@ -338,11 +401,14 @@ class GP(Model):
        :type fignum: figure number
        :param ax: axes to plot on.
        :type ax: axes handle
        :type output: integer (first output is 0)
        :param linecol: color of line to plot [Tango.colorsHex['darkBlue']]
-        :type linecol:
+        :type linecol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) as is standard in matplotlib
        :param fillcol: color of fill [Tango.colorsHex['lightBlue']]
-        :param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure
+        :type fillcol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) as is standard in matplotlib
        :param Y_metadata: additional data associated with Y which may be needed
        :type Y_metadata: dict
        :param data_symbol: symbol as used matplotlib, by default this is a black cross ('kx')
        :type data_symbol: color either as Tango.colorsHex object or character ('r' is red, 'g' is green) alongside marker type, as is standard in matplotlib.
        """
        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
        from ..plotting.matplot_dep import models_plots
@ -372,10 +438,8 @@ class GP(Model):
        :type max_f_eval: int
        :messages: whether to display during optimisation
        :type messages: bool
-        :param optimizer: which optimizer to use (defaults to self.preferred optimizer)
+        :param optimizer: which optimizer to use (defaults to self.preferred optimizer), a range of optimisers can be found in :module:`~GPy.inference.optimization`, they include 'scg', 'lbfgs', 'tnc'.
        :type optimizer: string
        TODO: valid args
        """
        self.inference_method.on_optimization_start()
        try:
@ -384,17 +448,17 @@ class GP(Model):
            print "KeyboardInterrupt caught, calling on_optimization_end() to round things up"
            self.inference_method.on_optimization_end()
            raise
-        
+
    def infer_newX(self, Y_new, optimize=True, ):
        """
        Infer the distribution of X for the new observed data *Y_new*.
-        
+
        :param Y_new: the new observed data for inference
        :type Y_new: numpy.ndarray
        :param optimize: whether to optimize the location of new X (True by default)
        :type optimize: boolean
-        :return: a tuple containing the posterior estimation of X and the model that optimize X 
+        :return: a tuple containing the posterior estimation of X and the model that optimize X
-        :rtype: (GPy.core.parameterization.variational.VariationalPosterior or numpy.ndarray, GPy.core.Model)
+        :rtype: (:class:`~GPy.core.parameterization.variational.VariationalPosterior` or numpy.ndarray, :class:`~GPy.core.model.Model`)
        """
        from ..inference.latent_function_inference.inferenceX import infer_newX
        return infer_newX(self, Y_new, optimize=optimize)
--- a/GPy/core/svigp.py
+++ b/GPy/core/svigp.py
@ -1,434 +0,0 @@
 # Copyright (c) 2012, James Hensman and Nicolo' Fusi
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from ..util.linalg import pdinv, mdot, tdot, dpotrs, dtrtrs, jitchol, backsub_both_sides
 from gp import GP
 import time
 import sys
 class SVIGP(GP):
    """
    Stochastic Variational inference in a Gaussian Process
    :param X: inputs
    :type X: np.ndarray (N x Q)
    :param Y: observed data
    :type Y: np.ndarray of observations (N x D)
    :param batchsize: the size of a h
    Additional kwargs are used as for a sparse GP. They include:
    :param q_u: canonical parameters of the distribution sqehd into a 1D array
    :type q_u: np.ndarray
    :param M: Number of inducing points (optional, default 10. Ignored if Z is not None)
    :type M: int
    :param kernel: the kernel/covariance function. See link kernels
    :type kernel: a GPy kernel
    :param Z: inducing inputs (optional, see note)
    :type Z: np.ndarray (M x Q) | None
    :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance)
    :type X_uncertainty: np.ndarray (N x Q) | None
    :param Zslices: slices for the inducing inputs (see slicing TODO: link)
    :param M: Number of inducing points (optional, default 10. Ignored if Z is not None)
    :type M: int
    :param beta: noise precision. TODO: ignore beta if doing EP
    :type beta: float
    :param normalize_(X|Y): whether to normalize the data before computing (predictions will be in original scales)
    :type normalize_(X|Y): bool
    """
    def __init__(self, X, Y, kernel, Z, q_u=None, batchsize=10):
        raise NotImplementedError, "This is a work in progress, see github issue "
        GP.__init__(self, X, Y, kernel)
        self.batchsize=batchsize
        self.Z = Z
        self.num_inducing = Z.shape[0]
        self.batchcounter = 0
        self.epochs = 0
        self.iterations = 0
        self.vb_steplength = 0.05
        self.param_steplength = 1e-5
        self.momentum = 0.9
        if q_u is None:
             q_u = np.hstack((np.random.randn(self.num_inducing*self.output_dim),-.5*np.eye(self.num_inducing).flatten()))
        self.set_vb_param(q_u)
        self._permutation = np.random.permutation(self.num_data)
        self.load_batch()
        self._param_trace = []
        self._ll_trace = []
        self._grad_trace = []
        #set the adaptive steplength parameters
        #self.hbar_t = 0.0
        #self.tau_t = 100.0
        #self.gbar_t = 0.0
        #self.gbar_t1 = 0.0
        #self.gbar_t2 = 0.0
        #self.hbar_tp = 0.0
        #self.tau_tp = 10000.0
        #self.gbar_tp = 0.0
        #self.adapt_param_steplength = True
        #self.adapt_vb_steplength = True
        #self._param_steplength_trace = []
        #self._vb_steplength_trace = []
    def _compute_kernel_matrices(self):
        # kernel computations, using BGPLVM notation
        self.Kmm = self.kern.K(self.Z)
        self.psi0 = self.kern.Kdiag(self.X_batch)
        self.psi1 = self.kern.K(self.X_batch, self.Z)
        self.psi2 = None
    def dL_dtheta(self):
        dL_dtheta = self.kern._param_grad_helper(self.dL_dKmm, self.Z)
        if self.has_uncertain_inputs:
            dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z, self.X_batch, self.X_variance_batch)
            dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1, self.Z, self.X_batch, self.X_variance_batch)
            dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2, self.Z, self.X_batch, self.X_variance_batch)
        else:
            dL_dtheta += self.kern._param_grad_helper(self.dL_dpsi1, self.X_batch, self.Z)
            dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X_batch)
        return dL_dtheta
    def _set_params(self, p, computations=True):
        self.kern._set_params_transformed(p[:self.kern.num_params])
        self.likelihood._set_params(p[self.kern.num_params:])
        if computations:
            self._compute_kernel_matrices()
            self._computations()
    def _get_params(self):
        return np.hstack((self.kern._get_params_transformed() , self.likelihood._get_params()))
    def _get_param_names(self):
        return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
    def load_batch(self):
        """
        load a batch of data (set self.X_batch and self.Y_batch from self.X, self.Y)
        """
        #if we've seen all the data, start again with them in a new random order
        if self.batchcounter+self.batchsize > self.num_data:
            self.batchcounter = 0
            self.epochs += 1
            self._permutation = np.random.permutation(self.num_data)
        this_perm = self._permutation[self.batchcounter:self.batchcounter+self.batchsize]
        self.X_batch = self.X[this_perm]
        self.Y_batch = self.Y[this_perm]
        self.batchcounter += self.batchsize
        self.data_prop = float(self.batchsize)/self.num_data
        self._compute_kernel_matrices()
        self._computations()
    def _computations(self,do_Kmm=True, do_Kmm_grad=True):
        """
        All of the computations needed. Some are optional, see kwargs.
        """
        if do_Kmm:
            self.Lm = jitchol(self.Kmm)
        # The rather complex computations of self.A
        if self.has_uncertain_inputs:
            if self.likelihood.is_heteroscedastic:
                psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.batchsize, 1, 1))).sum(0)
            else:
                psi2_beta = self.psi2.sum(0) * self.likelihood.precision
            evals, evecs = np.linalg.eigh(psi2_beta)
            clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
            tmp = evecs * np.sqrt(clipped_evals)
        else:
            if self.likelihood.is_heteroscedastic:
                tmp = self.psi1.T * (np.sqrt(self.likelihood.precision.flatten().reshape(1, self.batchsize)))
            else:
                tmp = self.psi1.T * (np.sqrt(self.likelihood.precision))
        tmp, _ = dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1)
        self.A = tdot(tmp)
        self.V = self.likelihood.precision*self.likelihood.Y
        self.VmT = np.dot(self.V,self.q_u_expectation[0].T)
        self.psi1V = np.dot(self.psi1.T, self.V)
        self.B = np.eye(self.num_inducing)*self.data_prop + self.A
        self.Lambda = backsub_both_sides(self.Lm, self.B.T)
        self.LQL = backsub_both_sides(self.Lm,self.q_u_expectation[1].T,transpose='right')
        self.trace_K = self.psi0.sum() - np.trace(self.A)/self.likelihood.precision
        self.Kmmi_m, _ = dpotrs(self.Lm, self.q_u_expectation[0], lower=1)
        self.projected_mean = np.dot(self.psi1,self.Kmmi_m)
        # Compute dL_dpsi
        self.dL_dpsi0 = - 0.5 * self.output_dim * self.likelihood.precision * np.ones(self.batchsize)
        self.dL_dpsi1, _ = dpotrs(self.Lm,np.asfortranarray(self.VmT.T),lower=1)
        self.dL_dpsi1 = self.dL_dpsi1.T
        dL_dpsi2 = -0.5 * self.likelihood.precision * backsub_both_sides(self.Lm, self.LQL - self.output_dim * np.eye(self.num_inducing))
        if self.has_uncertain_inputs:
            self.dL_dpsi2 = np.repeat(dL_dpsi2[None,:,:],self.batchsize,axis=0)
        else:
            self.dL_dpsi1 += 2.*np.dot(dL_dpsi2,self.psi1.T).T
            self.dL_dpsi2 = None
        # Compute dL_dKmm
        if do_Kmm_grad:
            tmp = np.dot(self.LQL,self.A) - backsub_both_sides(self.Lm,np.dot(self.q_u_expectation[0],self.psi1V.T),transpose='right')
            tmp += tmp.T
            tmp += -self.output_dim*self.B
            tmp += self.data_prop*self.LQL
            self.dL_dKmm = 0.5*backsub_both_sides(self.Lm,tmp)
        #Compute the gradient of the log likelihood wrt noise variance
        self.partial_for_likelihood =  -0.5*(self.batchsize*self.output_dim - np.sum(self.A*self.LQL))*self.likelihood.precision
        self.partial_for_likelihood +=  (0.5*self.output_dim*self.trace_K + 0.5 * self.likelihood.trYYT - np.sum(self.likelihood.Y*self.projected_mean))*self.likelihood.precision**2
    def log_likelihood(self):
        """
        As for uncollapsed sparse GP, but account for the proportion of data we're looking at right now.
        NB. self.batchsize is the size of the batch, not the size of X_all
        """
        assert not self.likelihood.is_heteroscedastic
        A = -0.5*self.batchsize*self.output_dim*(np.log(2.*np.pi) - np.log(self.likelihood.precision))
        B = -0.5*self.likelihood.precision*self.output_dim*self.trace_K
        Kmm_logdet = 2.*np.sum(np.log(np.diag(self.Lm)))
        C = -0.5*self.output_dim*self.data_prop*(Kmm_logdet-self.q_u_logdet - self.num_inducing)
        C += -0.5*np.sum(self.LQL * self.B)
        D = -0.5*self.likelihood.precision*self.likelihood.trYYT
        E = np.sum(self.V*self.projected_mean)
        return (A+B+C+D+E)/self.data_prop
    def _log_likelihood_gradients(self):
        return np.hstack((self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood)))/self.data_prop
    def vb_grad_natgrad(self):
        """
        Compute the gradients of the lower bound wrt the canonical and
        Expectation parameters of u.
        Note that the natural gradient in either is given by the gradient in
        the other (See Hensman et al 2012 Fast Variational inference in the
        conjugate exponential Family)
        """
        # Gradient for eta
        dL_dmmT_S = -0.5*self.Lambda/self.data_prop + 0.5*self.q_u_prec
        Kmmipsi1V,_ = dpotrs(self.Lm,self.psi1V,lower=1)
        dL_dm = (Kmmipsi1V - np.dot(self.Lambda,self.q_u_mean))/self.data_prop
        # Gradients for theta
        S = self.q_u_cov
        Si = self.q_u_prec
        m = self.q_u_mean
        dL_dSi = -mdot(S,dL_dmmT_S, S)
        dL_dmhSi = -2*dL_dSi
        dL_dSim = np.dot(dL_dSi,m) + np.dot(Si, dL_dm)
        return np.hstack((dL_dm.flatten(),dL_dmmT_S.flatten())) , np.hstack((dL_dSim.flatten(), dL_dmhSi.flatten()))
    def optimize(self, iterations, print_interval=10, callback=lambda:None, callback_interval=5):
        param_step = 0.
        #Iterate!
        for i in range(iterations):
            #store the current configuration for plotting later
            self._param_trace.append(self._get_params())
            self._ll_trace.append(self.log_likelihood() + self.log_prior())
            #load a batch
            self.load_batch()
            #compute the (stochastic) gradient
            natgrads = self.vb_grad_natgrad()
            grads = self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients())
            self._grad_trace.append(grads)
            #compute the steps in all parameters
            vb_step = self.vb_steplength*natgrads[0]
            if (self.epochs>=1):#only move the parameters after the first epoch
                param_step = self.momentum*param_step + self.param_steplength*grads
            else:
                param_step = 0.
            self.set_vb_param(self.get_vb_param() + vb_step)
            #Note: don't recompute everything here, wait until the next iteration when we have a new batch
            self._set_params(self._untransform_params(self._get_params_transformed() + param_step), computations=False)
            #print messages if desired
            if i and (not i%print_interval):
                print i, np.mean(self._ll_trace[-print_interval:]) #, self.log_likelihood()
                print np.round(np.mean(self._grad_trace[-print_interval:],0),3)
                sys.stdout.flush()
            #callback
            if i and not i%callback_interval:
                callback(self) # Change this to callback()
                time.sleep(0.01)
            if self.epochs > 10:
                self._adapt_steplength()
            self.iterations += 1
    def _adapt_steplength(self):
        if self.adapt_vb_steplength:
            # self._adaptive_vb_steplength()
            self._adaptive_vb_steplength_KL()
        self._vb_steplength_trace.append(self.vb_steplength)
        assert self.vb_steplength > 0
        if self.adapt_param_steplength:
            self._adaptive_param_steplength()
            # self._adaptive_param_steplength_log()
            # self._adaptive_param_steplength_from_vb()
        self._param_steplength_trace.append(self.param_steplength)
    def _adaptive_param_steplength(self):
        decr_factor = 0.02
        g_tp = self._transform_gradients(self._log_likelihood_gradients())
        self.gbar_tp = (1-1/self.tau_tp)*self.gbar_tp + 1/self.tau_tp * g_tp
        self.hbar_tp = (1-1/self.tau_tp)*self.hbar_tp + 1/self.tau_tp * np.dot(g_tp.T, g_tp)
        new_param_steplength = np.dot(self.gbar_tp.T, self.gbar_tp) / self.hbar_tp
        #- hack
        new_param_steplength *= decr_factor
        self.param_steplength = (self.param_steplength + new_param_steplength)/2
        #-
        self.tau_tp = self.tau_tp*(1-self.param_steplength) + 1
    def _adaptive_param_steplength_log(self):
        stp = np.logspace(np.log(0.0001), np.log(1e-6), base=np.e, num=18000)
        self.param_steplength = stp[self.iterations]
    def _adaptive_param_steplength_log2(self):
        self.param_steplength = (self.iterations + 0.001)**-0.5
    def _adaptive_param_steplength_from_vb(self):
        self.param_steplength = self.vb_steplength * 0.01
    def _adaptive_vb_steplength(self):
        decr_factor = 0.1
        g_t = self.vb_grad_natgrad()[0]
        self.gbar_t = (1-1/self.tau_t)*self.gbar_t + 1/self.tau_t * g_t
        self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t.T, g_t)
        new_vb_steplength = np.dot(self.gbar_t.T, self.gbar_t) / self.hbar_t
        #- hack
        new_vb_steplength *= decr_factor
        self.vb_steplength = (self.vb_steplength + new_vb_steplength)/2
        #-
        self.tau_t = self.tau_t*(1-self.vb_steplength) + 1
    def _adaptive_vb_steplength_KL(self):
        decr_factor = 0.1
        natgrad = self.vb_grad_natgrad()
        g_t1 = natgrad[0]
        g_t2 = natgrad[1]
        self.gbar_t1 = (1-1/self.tau_t)*self.gbar_t1 + 1/self.tau_t * g_t1
        self.gbar_t2 = (1-1/self.tau_t)*self.gbar_t2 + 1/self.tau_t * g_t2
        self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t1.T, g_t2)
        self.vb_steplength = np.dot(self.gbar_t1.T, self.gbar_t2) / self.hbar_t
        self.vb_steplength *= decr_factor
        self.tau_t = self.tau_t*(1-self.vb_steplength) + 1
    def _raw_predict(self, X_new, X_variance_new=None, which_parts='all',full_cov=False):
        """Internal helper function for making predictions, does not account for normalization"""
        #TODO: make this more efficient!
        self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)
        tmp = self.Kmmi- mdot(self.Kmmi,self.q_u_cov,self.Kmmi)
        if X_variance_new is None:
            Kx = self.kern.K(X_new,self.Z)
            mu = np.dot(Kx,self.Kmmi_m)
            if full_cov:
                Kxx = self.kern.K(X_new)
                var = Kxx - mdot(Kx,tmp,Kx.T)
            else:
                Kxx = self.kern.Kdiag(X_new)
                var = (Kxx - np.sum(Kx*np.dot(Kx,tmp),1))[:,None]
            return mu, var
        else:
            assert X_variance_new.shape == X_new.shape
            Kx = self.kern.psi1(self.Z,X_new, X_variance_new)
            mu = np.dot(Kx,self.Kmmi_m)
            Kxx = self.kern.psi0(self.Z,X_new,X_variance_new)
            psi2 = self.kern.psi2(self.Z,X_new,X_variance_new)
            diag_var = Kxx - np.sum(np.sum(psi2*tmp[None,:,:],1),1)
            if full_cov:
                raise NotImplementedError
            else:
                return mu, diag_var[:,None]
    def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False):
        # normalize X values
        Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
        if X_variance_new is not None:
            X_variance_new = X_variance_new / self._Xscale ** 2
        # here's the actual prediction by the GP model
        mu, var = self._raw_predict(Xnew, X_variance_new, full_cov=full_cov, which_parts=which_parts)
        # now push through likelihood
        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
        return mean, var, _025pm, _975pm
    def set_vb_param(self,vb_param):
        """set the distribution q(u) from the canonical parameters"""
        self.q_u_canonical_flat = vb_param.copy()
        self.q_u_canonical = self.q_u_canonical_flat[:self.num_inducing*self.output_dim].reshape(self.num_inducing,self.output_dim),self.q_u_canonical_flat[self.num_inducing*self.output_dim:].reshape(self.num_inducing,self.num_inducing)
        self.q_u_prec = -2.*self.q_u_canonical[1]
        self.q_u_cov, q_u_Li, q_u_L, tmp = pdinv(self.q_u_prec)
        self.q_u_Li = q_u_Li
        self.q_u_logdet = -tmp
        self.q_u_mean, _ = dpotrs(q_u_Li, np.asfortranarray(self.q_u_canonical[0]),lower=1)
        self.q_u_expectation = (self.q_u_mean, np.dot(self.q_u_mean,self.q_u_mean.T)+self.q_u_cov*self.output_dim)
    def get_vb_param(self):
        """
        Return the canonical parameters of the distribution q(u)
        """
        return self.q_u_canonical_flat
    def plot(self, *args, **kwargs):
        """
        See GPy.plotting.matplot_dep.svig_plots.plot
        """
        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
        from ..plotting.matplot_dep import svig_plots
        svig_plots.plot(self,*args,**kwargs)
    def plot_traces(self):
        """
        See GPy.plotting.matplot_dep.svig_plots.plot_traces
        """
        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
        from ..plotting.matplot_dep import svig_plots
        svig_plots.plot_traces(self)
--- a/GPy/examples/init.py
+++ b/GPy/examples/init.py
@ -4,6 +4,4 @@
 import classification
 import regression
 import dimensionality_reduction
 import tutorials
 import stochastic
 import non_gaussian
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@ -28,13 +28,13 @@ def oil(num_inducing=50, max_iters=100, kernel=None, optimize=True, plot=True):
    m = GPy.models.SparseGPClassification(X, Y, kernel=kernel, num_inducing=num_inducing)
    # Contrain all parameters to be positive
-    m.tie_params('.*len')
+    #m.tie_params('.*len')
    m['.*len'] = 10.
    m.update_likelihood_approximation()
    # Optimize
    if optimize:
-        m.optimize(max_iters=max_iters)
+        for _ in range(5):
            m.optimize(max_iters=int(max_iters/5))
    print(m)
    #Test
@ -150,7 +150,7 @@ def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, opti
    print m
    return m
-def toy_heaviside(seed=default_seed, optimize=True, plot=True):
+def toy_heaviside(seed=default_seed, max_iters=100, optimize=True, plot=True):
    """
    Simple 1D classification example using a heavy side gp transformation
@ -166,16 +166,18 @@ def toy_heaviside(seed=default_seed, optimize=True, plot=True):
    Y[Y.flatten() == -1] = 0
    # Model definition
-    noise_model = GPy.likelihoods.bernoulli(GPy.likelihoods.noise_models.gp_transformations.Heaviside())
+    kernel = GPy.kern.RBF(1)
-    likelihood = GPy.likelihoods.EP(Y, noise_model)
+    likelihood = GPy.likelihoods.Bernoulli(gp_link=GPy.likelihoods.link_functions.Heaviside())
-    m = GPy.models.GPClassification(data['X'], likelihood=likelihood)
+    ep = GPy.inference.latent_function_inference.expectation_propagation.EP()
    m = GPy.core.GP(X=data['X'], Y=Y, kernel=kernel, likelihood=likelihood, inference_method=ep, name='gp_classification_heaviside')
    #m = GPy.models.GPClassification(data['X'], likelihood=likelihood)
    # Optimize
    if optimize:
        m.update_likelihood_approximation()
        # Parameters optimization:
-        m.optimize()
+        for _ in range(5):
-        #m.pseudo_EM()
+            m.optimize(max_iters=int(max_iters/5))
        print m
    # Plot
    if plot:
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@ -368,7 +368,7 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1,
                      max_iters=2e4,
                      ):
    from GPy import kern
-    from GPy.models import BayesianGPLVM
+    from GPy.models.bayesian_gplvm_minibatch import BayesianGPLVMMiniBatch
    D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 400, 3, 4
    _, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, plot_sim)
@ -379,7 +379,7 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1,
    Ymissing = Y.copy()
    Ymissing[inan] = _np.nan
-    m = BayesianGPLVM(Ymissing, Q, init="random", num_inducing=num_inducing,
+    m = BayesianGPLVMMiniBatch(Ymissing, Q, init="random", num_inducing=num_inducing,
                      kernel=k, missing_data=True)
    m.Yreal = Y
@ -624,7 +624,10 @@ def cmu_mocap(subject='35', motion=['01'], in_place=True, optimize=True, verbose
    if in_place:
        # Make figure move in place.
        data['Y'][:, 0:3] = 0.0
-    m = GPy.models.GPLVM(data['Y'], 2, normalize_Y=True)
+    Y = data['Y']
    Y_mean = Y.mean(0)
    Y_std = Y.std(0)
    m = GPy.models.GPLVM((Y-Y_mean)/Y_std, 2)
    if optimize: m.optimize(messages=verbose, max_f_eval=10000)
    if plot:
--- a/GPy/examples/non_gaussian.py
+++ b/GPy/examples/non_gaussian.py
@ -59,7 +59,7 @@ def student_t_approx(optimize=True, plot=True):
    t_distribution = GPy.likelihoods.StudentT(deg_free=deg_free, sigma2=edited_real_sd)
    laplace_inf = GPy.inference.latent_function_inference.Laplace()
    m3 = GPy.core.GP(X, Y.copy(), kernel3, likelihood=t_distribution, inference_method=laplace_inf)
-    m3['.*t_noise'].constrain_bounded(1e-6, 10.)
+    m3['.*t_scale2'].constrain_bounded(1e-6, 10.)
    m3['.*white'].constrain_fixed(1e-5)
    m3.randomize()
@ -67,7 +67,7 @@ def student_t_approx(optimize=True, plot=True):
    t_distribution = GPy.likelihoods.StudentT(deg_free=deg_free, sigma2=edited_real_sd)
    laplace_inf = GPy.inference.latent_function_inference.Laplace()
    m4 = GPy.core.GP(X, Yc.copy(), kernel4, likelihood=t_distribution, inference_method=laplace_inf)
-    m4['.*t_noise'].constrain_bounded(1e-6, 10.)
+    m4['.*t_scale2'].constrain_bounded(1e-6, 10.)
    m4['.*white'].constrain_fixed(1e-5)
    m4.randomize()
    print m4
@ -124,6 +124,7 @@ def student_t_approx(optimize=True, plot=True):
    return m1, m2, m3, m4
 def boston_example(optimize=True, plot=True):
    raise NotImplementedError("Needs updating")
    import sklearn
    from sklearn.cross_validation import KFold
    optimizer='bfgs'
@ -152,8 +153,8 @@ def boston_example(optimize=True, plot=True):
        noise = 1e-1 #np.exp(-2)
        rbf_len = 0.5
        data_axis_plot = 4
-        kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
+        kernelstu = GPy.kern.RBF(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
-        kernelgp = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
+        kernelgp = GPy.kern.RBF(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1])
        #Baseline
        score_folds[0, n] = rmse(Y_test, np.mean(Y_train))
@ -162,8 +163,8 @@ def boston_example(optimize=True, plot=True):
        print "Gauss GP"
        mgp = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelgp.copy())
        mgp.constrain_fixed('.*white', 1e-5)
-        mgp['rbf_len'] = rbf_len
+        mgp['.*len'] = rbf_len
-        mgp['noise'] = noise
+        mgp['.*noise'] = noise
        print mgp
        if optimize:
            mgp.optimize(optimizer=optimizer, messages=messages)
@ -198,9 +199,9 @@ def boston_example(optimize=True, plot=True):
            stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution)
            mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood)
            mstu_t.constrain_fixed('.*white', 1e-5)
-            mstu_t.constrain_bounded('.*t_noise', 0.0001, 1000)
+            mstu_t.constrain_bounded('.*t_scale2', 0.0001, 1000)
            mstu_t['rbf_len'] = rbf_len
-            mstu_t['.*t_noise'] = noise
+            mstu_t['.*t_scale2'] = noise
            print mstu_t
            if optimize:
                mstu_t.optimize(optimizer=optimizer, messages=messages)
--- a/GPy/examples/stochastic.py
+++ b/GPy/examples/stochastic.py
@ -1,40 +0,0 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 try:
    import pylab as pb
 except:
    pass
 import numpy as np
 import GPy
 def toy_1d(optimize=True, plot=True):
    N = 2000
    M = 20
    #create data
    X = np.linspace(0,32,N)[:,None]
    Z = np.linspace(0,32,M)[:,None]
    Y = np.sin(X) + np.cos(0.3*X) + np.random.randn(*X.shape)/np.sqrt(50.)
    m = GPy.models.SVIGPRegression(X,Y, batchsize=10, Z=Z)
    m.constrain_bounded('noise_variance',1e-3,1e-1)
    m.constrain_bounded('white_variance',1e-3,1e-1)
    m.param_steplength = 1e-4
    if plot:
        fig = pb.figure()
        ax = fig.add_subplot(111)
        def cb(foo):
            ax.cla()
            m.plot(ax=ax,Z_height=-3)
            ax.set_ylim(-3,3)
            fig.canvas.draw()
    if optimize:
        m.optimize(500, callback=cb, callback_interval=1)
    if plot:
        m.plot_traces()
    return m
--- a/GPy/examples/tutorials.py
+++ b/GPy/examples/tutorials.py
@ -1,156 +0,0 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 """
 Code of Tutorials
 """
 try:
    import pylab as pb
    pb.ion()
 except:
    pass
 import numpy as np
 import GPy
 def tuto_GP_regression(optimize=True, plot=True):
    """The detailed explanations of the commands used in this file can be found in the tutorial section"""
    X = np.random.uniform(-3.,3.,(20,1))
    Y = np.sin(X) + np.random.randn(20,1)*0.05
    kernel = GPy.kern.rbf(input_dim=1, variance=1., lengthscale=1.)
    m = GPy.models.GPRegression(X, Y, kernel)
    print m
    if plot:
        m.plot()
    m.constrain_positive('')
    m.unconstrain('')               # may be used to remove the previous constrains
    m.constrain_positive('.*rbf_variance')
    m.constrain_bounded('.*lengthscale',1.,10. )
    m.constrain_fixed('.*noise',0.0025)
    if optimize:
        m.optimize()
        m.optimize_restarts(num_restarts = 10)
    #######################################################
    #######################################################
    # sample inputs and outputs
    X = np.random.uniform(-3.,3.,(50,2))
    Y = np.sin(X[:,0:1]) * np.sin(X[:,1:2])+np.random.randn(50,1)*0.05
    # define kernel
    ker = GPy.kern.Matern52(2,ARD=True) + GPy.kern.white(2)
    # create simple GP model
    m = GPy.models.GPRegression(X, Y, ker)
    # contrain all parameters to be positive
    m.constrain_positive('')
    # optimize and plot
    if optimize:
        m.optimize('tnc', max_f_eval = 1000)
    if plot:
        m.plot()
    print m
    return(m)
 def tuto_kernel_overview(optimize=True, plot=True):
    """The detailed explanations of the commands used in this file can be found in the tutorial section"""
    ker1 = GPy.kern.rbf(1)  # Equivalent to ker1 = GPy.kern.rbf(input_dim=1, variance=1., lengthscale=1.)
    ker2 = GPy.kern.rbf(input_dim=1, variance = .75, lengthscale=2.)
    ker3 = GPy.kern.rbf(1, .5, .5)
    print ker2
    if plot:
        ker1.plot()
        ker2.plot()
        ker3.plot()
    k1 = GPy.kern.rbf(1,1.,2.)
    k2 = GPy.kern.Matern32(1, 0.5, 0.2)
    # Product of kernels
    k_prod = k1.prod(k2)                        # By default, tensor=False
    k_prodtens = k1.prod(k2,tensor=True)
    # Sum of kernels
    k_add = k1.add(k2)                          # By default, tensor=False
    k_addtens = k1.add(k2,tensor=True)
    k1 = GPy.kern.rbf(1,1.,2)
    k2 = GPy.kern.periodic_Matern52(1,variance=1e3, lengthscale=1, period = 1.5, lower=-5., upper = 5)
    k = k1 * k2  # equivalent to k = k1.prod(k2)
    print k
    # Simulate sample paths
    X = np.linspace(-5,5,501)[:,None]
    Y = np.random.multivariate_normal(np.zeros(501),k.K(X),1)
    k1 = GPy.kern.rbf(1)
    k2 = GPy.kern.Matern32(1)
    k3 = GPy.kern.white(1)
    k = k1 + k2 + k3
    print k
    k.constrain_positive('.*var')
    k.constrain_fixed(np.array([1]),1.75)
    k.tie_params('.*len')
    k.unconstrain('white')
    k.constrain_bounded('white',lower=1e-5,upper=.5)
    print k
    k_cst = GPy.kern.bias(1,variance=1.)
    k_mat = GPy.kern.Matern52(1,variance=1., lengthscale=3)
    Kanova = (k_cst + k_mat).prod(k_cst + k_mat,tensor=True)
    print Kanova
    # sample inputs and outputs
    X = np.random.uniform(-3.,3.,(40,2))
    Y = 0.5*X[:,:1] + 0.5*X[:,1:] + 2*np.sin(X[:,:1]) * np.sin(X[:,1:])
    # Create GP regression model
    m = GPy.models.GPRegression(X, Y, Kanova)
    if plot:
        fig = pb.figure(figsize=(5,5))
        ax = fig.add_subplot(111)
        m.plot(ax=ax)
        pb.figure(figsize=(20,3))
        pb.subplots_adjust(wspace=0.5)
        axs = pb.subplot(1,5,1)
        m.plot(ax=axs)
        pb.subplot(1,5,2)
        pb.ylabel("=   ",rotation='horizontal',fontsize='30')
        axs = pb.subplot(1,5,3)
        m.plot(ax=axs, which_parts=[False,True,False,False])
        pb.ylabel("cst          +",rotation='horizontal',fontsize='30')
        axs = pb.subplot(1,5,4)
        m.plot(ax=axs, which_parts=[False,False,True,False])
        pb.ylabel("+   ",rotation='horizontal',fontsize='30')
        axs = pb.subplot(1,5,5)
        pb.ylabel("+   ",rotation='horizontal',fontsize='30')
        m.plot(ax=axs, which_parts=[False,False,False,True])
    return(m)
 def model_interaction(optimize=True, plot=True):
    X = np.random.randn(20,1)
    Y = np.sin(X) + np.random.randn(*X.shape)*0.01 + 5.
    k = GPy.kern.rbf(1) + GPy.kern.bias(1)
    m = GPy.models.GPRegression(X, Y, kernel=k)
    return m
--- a/GPy/models/init.py
+++ b/GPy/models/init.py
@ -4,7 +4,6 @@
 from gp_regression import GPRegression
 from gp_classification import GPClassification
 from sparse_gp_regression import SparseGPRegression, SparseGPRegressionUncertainInput
 from svigp_regression import SVIGPRegression
 from sparse_gp_classification import SparseGPClassification
 from gplvm import GPLVM
 from bcgplvm import BCGPLVM
--- a/GPy/models/gp_multioutput_regression.py
+++ b/GPy/models/gp_multioutput_regression.py
@ -1,171 +0,0 @@
 # Copyright (c) 2013, Ricardo Andrade
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from ..core import GP
 from .. import likelihoods
 from .. import kern
 class GPMultioutputRegression(GP):
    """
    Multiple output Gaussian process with Gaussian noise
    This is a wrapper around the models.GP class, with a set of sensible defaults
    :param X_list: input observations
    :type X_list: list of numpy arrays (num_data_output_i x input_dim), one array per output
    :param Y_list: observed values
    :type Y_list: list of numpy arrays (num_data_output_i x 1), one array per output
    :param kernel_list: GPy kernels, defaults to rbf
    :type kernel_list: list of GPy kernels
    :param noise_variance_list: noise parameters per output, defaults to 1.0 for every output
    :type noise_variance_list: list of floats
    :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_X: False|True
    :param normalize_Y:  whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_Y: False|True
    :param rank: number tuples of the corregionalization parameters 'coregion_W' (see coregionalize kernel documentation)
    :type rank: integer
    """
    def __init__(self,X_list,Y_list,kernel_list=None,noise_variance_list=None,normalize_X=False,normalize_Y=False,rank=1):
        self.output_dim = len(Y_list)
        assert len(X_list) == self.output_dim, 'Number of outputs do not match length of inputs list.'
        #Inputs indexing
        i = 0
        index = []
        for x,y in zip(X_list,Y_list):
            assert x.shape[0] == y.shape[0]
            index.append(np.repeat(i,x.size)[:,None])
            i += 1
        index = np.vstack(index)
        X = np.hstack([np.vstack(X_list),index])
        original_dim = X.shape[1] - 1
        #Mixed noise likelihood definition
        likelihood = likelihoods.Gaussian_Mixed_Noise(Y_list,noise_params=noise_variance_list,normalize=normalize_Y)
        #Coregionalization kernel definition
        if kernel_list is None:
            kernel_list = [kern.rbf(original_dim)]
        mkernel = kern.build_lcm(input_dim=original_dim, output_dim=self.output_dim, kernel_list = kernel_list, rank=rank)
        self.multioutput = True
        GP.__init__(self, X, likelihood, mkernel, normalize_X=normalize_X)
        self.ensure_default_constraints()
    def _add_output_index(self,X,output):
        """
        In a multioutput model, appends an index column to X to specify the output it is related to.
        :param X: Input data
        :type X: np.ndarray, N x self.input_dim
        :param output: output X is related to
        :type output: integer in {0,..., output_dim-1}
        .. Note:: For multiple non-independent outputs models only.
        """
        assert hasattr(self,'multioutput'), 'This function is for multiple output models only.'
        index = np.ones((X.shape[0],1))*output
        return np.hstack((X,index))
    def plot_single_output(self, X, output):
        """
        A simple wrapper around self.plot, with appropriate setting of the fixed_inputs argument
        """
        raise NotImplementedError
    def _raw_predict_single_output(self, _Xnew, output, which_parts='all', full_cov=False,stop=False):
        """
        For a specific output, calls _raw_predict() at the new point(s) _Xnew.
        This functions calls _add_output_index(), so _Xnew should not have an index column specifying the output.
        ---------
        :param Xnew: The points at which to make a prediction
        :type Xnew: np.ndarray, Nnew x self.input_dim
        :param output: output to predict
        :type output: integer in {0,..., output_dim-1}
        :param which_parts:  specifies which outputs kernel(s) to use in prediction
        :type which_parts: ('all', list of bools)
        :param full_cov: whether to return the full covariance matrix, or just the diagonal
        .. Note:: For multiple non-independent outputs models only.
        """
        _Xnew = self._add_output_index(_Xnew, output)
        return self._raw_predict(_Xnew, which_parts=which_parts,full_cov=full_cov, stop=stop)
    def predict_single_output(self, Xnew,output=0, which_parts='all', full_cov=False, likelihood_args=dict()):
        """
        For a specific output, calls predict() at the new point(s) Xnew.
        This functions calls _add_output_index(), so Xnew should not have an index column specifying the output.
        :param Xnew: The points at which to make a prediction
        :type Xnew: np.ndarray, Nnew x self.input_dim
        :param which_parts:  specifies which outputs kernel(s) to use in prediction
        :type which_parts: ('all', list of bools)
        :param full_cov: whether to return the full covariance matrix, or just the diagonal
        :type full_cov: bool
        :returns: mean: posterior mean,  a Numpy array, Nnew x self.input_dim
        :returns: var: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
        :returns: lower and upper boundaries of the 95% confidence intervals, Numpy arrays,  Nnew x self.input_dim
        .. Note:: For multiple non-independent outputs models only.
        """
        Xnew = self._add_output_index(Xnew, output)
        return self.predict(Xnew, which_parts=which_parts, full_cov=full_cov, likelihood_args=likelihood_args)
    def plot_single_output_f(self, output=None, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False, fignum=None, ax=None):
        """
        For a specific output, in a multioutput model, this function works just as plot_f on single output models.
        :param output: which output to plot (for multiple output models only)
        :type output: integer (first output is 0)
        :param samples: the number of a posteriori samples to plot
        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
        :param which_data: which if the training data to plot (default all)
        :type which_data: 'all' or a slice object to slice self.X, self.Y
        :param which_parts: which of the kernel functions to plot (additively)
        :type which_parts: 'all', or list of bools
        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
        :type resolution: int
        :param full_cov:
        :type full_cov: bool
                :param fignum: figure to plot on.
        :type fignum: figure number
        :param ax: axes to plot on.
        :type ax: axes handle
        """
        assert output is not None, "An output must be specified."
        assert len(self.likelihood.noise_model_list) > output, "The model has only %s outputs." %(self.output_dim + 1)
        if which_data == 'all':
            which_data = slice(None)
        if ax is None:
            fig = pb.figure(num=fignum)
            ax = fig.add_subplot(111)
        if self.X.shape[1] == 2:
            Xu = self.X[self.X[:,-1]==output ,0:1]
            Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
            Xnew_indexed = self._add_output_index(Xnew,output)
            m, v = self._raw_predict(Xnew_indexed, which_parts=which_parts)
            if samples:
                Ysim = self.posterior_samples_f(Xnew_indexed, samples, which_parts=which_parts, full_cov=True)
                for yi in Ysim.T:
                    ax.plot(Xnew, yi[:,None], Tango.colorsHex['darkBlue'], linewidth=0.25)
            gpplot(Xnew, m, m - 2 * np.sqrt(v), m + 2 * np.sqrt(v), axes=ax)
            ax.plot(Xu[which_data], self.likelihood.Y[self.likelihood.index==output][:,None], 'kx', mew=1.5)
            ax.set_xlim(xmin, xmax)
            ymin, ymax = min(np.append(self.likelihood.Y, m - 2 * np.sqrt(np.diag(v)[:, None]))), max(np.append(self.likelihood.Y, m + 2 * np.sqrt(np.diag(v)[:, None])))
            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
            ax.set_ylim(ymin, ymax)
--- a/GPy/models/sparse_gp_multioutput_regression.py
+++ b/GPy/models/sparse_gp_multioutput_regression.py
@ -1,80 +0,0 @@
 # Copyright (c) 2013, Ricardo Andrade
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from ..core import SparseGP
 from .. import likelihoods
 from .. import kern
 from ..util import multioutput
 class SparseGPMultioutputRegression(SparseGP):
    """
    Sparse multiple output Gaussian process with Gaussian noise
    This is a wrapper around the models.SparseGP class, with a set of sensible defaults
    :param X_list: input observations
    :type X_list: list of numpy arrays (num_data_output_i x input_dim), one array per output
    :param Y_list: observed values
    :type Y_list: list of numpy arrays (num_data_output_i x 1), one array per output
    :param kernel_list: GPy kernels, defaults to rbf
    :type kernel_list: list of GPy kernels
    :param noise_variance_list: noise parameters per output, defaults to 1.0 for every output
    :type noise_variance_list: list of floats
    :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_X: False|True
    :param normalize_Y:  whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_Y: False|True
    :param Z_list: inducing inputs (optional)
    :type Z_list: list of numpy arrays (num_inducing_output_i x input_dim), one array per output | empty list
    :param num_inducing: number of inducing inputs per output, defaults to 10 (ignored if Z_list is not empty)
    :type num_inducing: integer
    :param rank: number tuples of the corregionalization parameters 'coregion_W' (see coregionalize kernel documentation)
    :type rank: integer
    """
    #NOTE not tested with uncertain inputs
    def __init__(self,X_list,Y_list,kernel_list=None,noise_variance_list=None,normalize_X=False,normalize_Y=False,Z_list=[],num_inducing=10,rank=1):
        self.output_dim = len(Y_list)
        assert len(X_list) == self.output_dim, 'Number of outputs do not match length of inputs list.'
        #Inducing inputs list
        if len(Z_list):
            assert len(Z_list) == self.output_dim, 'Number of outputs do not match length of inducing inputs list.'
        else:
            if isinstance(num_inducing,np.int):
                num_inducing = [num_inducing] * self.output_dim
            num_inducing = np.asarray(num_inducing)
            assert num_inducing.size == self.output_dim, 'Number of outputs do not match length of inducing inputs list.'
            for ni,X in zip(num_inducing,X_list):
                i = np.random.permutation(X.shape[0])[:ni]
                Z_list.append(X[i].copy())
        #Inputs and inducing inputs indexing
        i = 0
        index = []
        index_z = []
        for x,y,z in zip(X_list,Y_list,Z_list):
            assert x.shape[0] == y.shape[0]
            index.append(np.repeat(i,x.size)[:,None])
            index_z.append(np.repeat(i,z.size)[:,None])
            i += 1
        index = np.vstack(index)
        index_z = np.vstack(index_z)
        X = np.hstack([np.vstack(X_list),index])
        Z = np.hstack([np.vstack(Z_list),index_z])
        original_dim = X.shape[1] - 1
        #Mixed noise likelihood definition
        likelihood = likelihoods.Gaussian_Mixed_Noise(Y_list,noise_params=noise_variance_list,normalize=normalize_Y)
        #Coregionalization kernel definition
        if kernel_list is None:
            kernel_list = [kern.rbf(original_dim)]
        mkernel = kern.build_lcm(input_dim=original_dim, output_dim=self.output_dim, kernel_list = kernel_list, rank=rank)
        self.multioutput = True
        SparseGP.__init__(self, X, likelihood, mkernel, Z=Z, normalize_X=normalize_X)
        self.constrain_fixed('.*iip_\d+_1')
        self.ensure_default_constraints()
--- a/GPy/models/svigp_regression.py
+++ b/GPy/models/svigp_regression.py
@ -1,45 +0,0 @@
 # Copyright (c) 2012, James Hensman
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from ..core import SVIGP
 from .. import likelihoods
 from .. import kern
 class SVIGPRegression(SVIGP):
    """
    Gaussian Process model for regression
    This is a thin wrapper around the SVIGP class, with a set of sensible defalts
    :param X: input observations
    :param Y: observed values
    :param kernel: a GPy kernel, defaults to rbf+white
    :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_X: False|True
    :param normalize_Y:  whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_Y: False|True
    :rtype: model object
    .. Note:: Multiple independent outputs are allowed using columns of Y
    """
    def __init__(self, X, Y, kernel=None, Z=None, num_inducing=10, q_u=None, batchsize=10, normalize_Y=False):
        # kern defaults to rbf (plus white for stability)
        if kernel is None:
            kernel = kern.rbf(X.shape[1], variance=1., lengthscale=4.) + kern.white(X.shape[1], 1e-3)
        # Z defaults to a subset of the data
        if Z is None:
            i = np.random.permutation(X.shape[0])[:num_inducing]
            Z = X[i].copy()
        else:
            assert Z.shape[1] == X.shape[1]
        # likelihood defaults to Gaussian
        likelihood = likelihoods.Gaussian(Y, normalize=normalize_Y)
        SVIGP.__init__(self, X, likelihood, kernel, Z, q_u=q_u, batchsize=batchsize)
        self.load_batch()
--- a/GPy/plotting/matplot_dep/init.py
+++ b/GPy/plotting/matplot_dep/init.py
@ -6,7 +6,6 @@ import models_plots
 import priors_plots
 import variational_plots
 import kernel_plots
 import svig_plots
 import dim_reduction_plots
 import mapping_plots
 import Tango
--- a/doc/index.rst
+++ b/doc/index.rst
@ -5,23 +5,28 @@
 Welcome to GPy's documentation!
 ===============================
 For a quick start, you can have a look at one of the tutorials:
-* `Basic Gaussian process regression <tuto_GP_regression.html>`_  
+`GPy <http://sheffieldml.github.io/GPy/>`_ is a Gaussian Process (GP) framework written in Python, from the Sheffield machine learning group.
 * `Interacting with models <tuto_interacting_with_models.html>`_
 * `A kernel overview <tuto_kernel_overview.html>`_ 
 * `Writing new kernels <tuto_creating_new_kernels.html>`_
 * `Writing new models <tuto_creating_new_models.html>`_
 * `Parameterization handles <tuto_parameterized.html>`_
-You may also be interested by some examples in the GPy/examples folder.
+The `GPy homepage <http://sheffieldml.github.io/GPy/>`_ contains tutorials for users and further information on the project, including installation instructions.
 This documentation is mostly aimed at developers interacting closely with the code-base.
 The code can be found on our `Github project page <https://github.com/SheffieldML/GPy>`_. It is open source and provided under the BSD license.
 .. * `Basic Gaussian process regression <tuto_GP_regression.html>`_  
 .. * `Interacting with models <tuto_interacting_with_models.html>`_
 .. * `A kernel overview <tuto_kernel_overview.html>`_ 
 .. * `Writing new kernels <tuto_creating_new_kernels.html>`_
 .. * `Writing new models <tuto_creating_new_models.html>`_
 .. * `Parameterization handles <tuto_parameterized.html>`_
 .. You may also be interested by some examples in the GPy/examples folder.
 Contents:
 .. toctree::
   :maxdepth: 2
   installation
   GPy
--- a/doc/log.txt
+++ b/doc/log.txt
@ -1,7 +1,7 @@
 /home/alans/Work/GPy/GPy/__init__.py:docstring of GPy.load:1: WARNING: Inline interpreted text or phrase reference start-string without end-string.
-/home/alans/Work/GPy/GPy/core/gp.py:docstring of GPy.core.gp.GP:7: WARNING: Field list ends without a blank line; unexpected unindent.
+/home/alans/Work/GPy/GPy/core/gp.py:docstring of GPy.core.gp.GP.optimize:8: ERROR: Unknown interpreted text role "module".
 /home/alans/Work/GPy/GPy/core/gp.py:docstring of GPy.core.gp.GP:10: ERROR: Unexpected indentation.
 /home/alans/Work/GPy/GPy/core/gp.py:docstring of GPy.core.gp.GP.predictive_gradients:5: ERROR: Unexpected indentation.
 /home/alans/Work/GPy/GPy/core/gp.py:docstring of GPy.core.gp.GP.predictive_gradients:8: WARNING: Block quote ends without a blank line; unexpected unindent.
 /home/alans/Work/GPy/GPy/core/model.py:docstring of GPy.core.model.Model.optimize_restarts:29: WARNING: Explicit markup ends without a blank line; unexpected unindent.
 /home/alans/Work/GPy/doc/GPy.core.rst:65: WARNING: autodoc: failed to import module u'GPy.core.symbolic'; the following exception was raised:
 Traceback (most recent call last):
@ -23,31 +23,6 @@ ImportError: No module named lambdify
 /home/alans/Work/GPy/GPy/core/parameterization/ties_and_remappings.py:docstring of GPy.core.parameterization.ties_and_remappings.Tie:18: SEVERE: Unexpected section title or transition.
 ================================
 /home/alans/Work/GPy/doc/GPy.examples.rst:18: WARNING: autodoc: failed to import module u'GPy.examples.coreg_example'; the following exception was raised:
 Traceback (most recent call last):
  File "/home/alans/anaconda/envs/GPy/lib/python2.7/site-packages/sphinx/ext/autodoc.py", line 335, in import_object
    __import__(self.modname)
  File "/home/alans/Work/GPy/GPy/examples/coreg_example.py", line 44, in <module>
    m = GPy.models.SparseGPCoregionalizedRegression(X_list=[X1], Y_list=[Y1], Z_list = [Z1], kernel=kern)
  File "/home/alans/Work/GPy/GPy/core/parameterization/parameterized.py", line 19, in __call__
    self = super(ParametersChangedMeta, self).__call__(*args, **kw)
  File "/home/alans/Work/GPy/GPy/models/sparse_gp_coregionalized_regression.py", line 66, in __init__
    self['.*inducing'][:,-1].fix()
  File "/home/alans/Work/GPy/GPy/core/parameterization/parameter_core.py", line 444, in constrain_fixed
    self.notify_observers(self, None if trigger_parent else -np.inf)
  File "/home/alans/Work/GPy/GPy/core/parameterization/parameter_core.py", line 148, in notify_observers
    [callble(self, which=which) for _, _, callble in self.observers]
  File "/home/alans/Work/GPy/GPy/core/parameterization/parameter_core.py", line 1077, in _pass_through_notify_observers
    self.notify_observers(which=which)
  File "/home/alans/Work/GPy/GPy/core/parameterization/parameter_core.py", line 148, in notify_observers
    [callble(self, which=which) for _, _, callble in self.observers]
  File "/home/alans/Work/GPy/GPy/core/parameterization/parameter_core.py", line 1075, in _parameters_changed_notification
    self.parameters_changed()
  File "/home/alans/Work/GPy/GPy/core/sparse_gp.py", line 66, in parameters_changed
    self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata)
  File "/home/alans/Work/GPy/GPy/inference/latent_function_inference/var_dtc.py", line 170, in inference
    import ipdb; ipdb.set_trace()
 ImportError: No module named ipdb
 /home/alans/Work/GPy/GPy/kern/_src/coregionalize.py:docstring of GPy.kern._src.coregionalize.Coregionalize:5: ERROR: Unexpected indentation.
 /home/alans/Work/GPy/doc/GPy.kern._src.rst:73: WARNING: autodoc: failed to import module u'GPy.kern._src.hierarchical'; the following exception was raised:
 Traceback (most recent call last):
@ -187,6 +162,7 @@ Parameter handles
 :py:class:`~GPy.core.parameterization.param.Param`
 ===========
 /home/alans/Work/GPy/doc/installation.rst:: WARNING: document isn't included in any toctree
 /home/alans/Work/GPy/doc/kernel_implementation.rst:: WARNING: document isn't included in any toctree
 /home/alans/Work/GPy/doc/modules.rst:: WARNING: document isn't included in any toctree
 /home/alans/Work/GPy/doc/tuto_GP_regression.rst:: WARNING: document isn't included in any toctree