Merge branch 'devel' into link_functions

2026-07-23 17:01:06 +02:00 · 2013-06-04 16:24:13 +01:00 · 2013-06-04 16:24:13 +01:00 · 2dfda94176
commit 2dfda94176
parent 775ec15543 d84ae52d41
28 changed files with 436 additions and 477 deletions
--- a/.travis.yml
+++ b/.travis.yml
@ -10,6 +10,11 @@ virtualenv:
 before_install: 
  - sudo apt-get install -qq python-scipy python-pip
  - sudo apt-get install -qq python-matplotlib
  # Workaround for a permissions issue with Travis virtual machine images
  # that breaks Python's multiprocessing:
  # https://github.com/travis-ci/travis-cookbooks/issues/155
  - sudo rm -rf /dev/shm
  - sudo ln -s /run/shm /dev/shm
 install:
  - pip install --upgrade numpy==1.7.1 
--- a/GPy/init.py
+++ b/GPy/init.py
@ -2,16 +2,17 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
-import kern
+import core
 import models
 import inference
 import util
 import examples
 from core import priors
 import likelihoods
 import testing
 from numpy.testing import Tester
 from nose.tools import nottest
 import kern
 from core import priors
@nottest
 def tests():
--- a/GPy/core/GP.py
+++ b/GPy/core/GP.py
@ -0,0 +1,146 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from scipy import linalg
 import pylab as pb
 from .. import kern
 from ..util.linalg import pdinv, mdot, tdot
 #from ..util.plot import gpplot,  Tango
 from ..likelihoods import EP
 from gp_base import GPBase
 class GP(GPBase):
    """
    Gaussian Process model for regression and EP
    :param X: input observations
    :param kernel: a GPy kernel, defaults to rbf+white
    :parm likelihood: a GPy likelihood
    :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_X: False|True
    :rtype: model object
    :param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1
    :param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.]
    :type powerep: list
    .. Note:: Multiple independent outputs are allowed using columns of Y
    """
    def __init__(self, X, likelihood, kernel, normalize_X=False):
        super(GP, self).__init__(X, likelihood, kernel, normalize_X=normalize_X)
        self._set_params(self._get_params())
    def _set_params(self, p):
        self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
        self.likelihood._set_params(p[self.kern.Nparam_transformed():])
        self.K = self.kern.K(self.X)
        self.K += self.likelihood.covariance_matrix
        self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
        # the gradient of the likelihood wrt the covariance matrix
        if self.likelihood.YYT is None:
            #alpha = np.dot(self.Ki, self.likelihood.Y)
            alpha,_ = linalg.lapack.flapack.dpotrs(self.L, self.likelihood.Y,lower=1)
            self.dL_dK = 0.5 * (tdot(alpha) - self.D * self.Ki)
        else:
            #tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki)
            tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(self.likelihood.YYT), lower=1)
            tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(tmp.T), lower=1)
            self.dL_dK = 0.5 * (tmp - self.D * self.Ki)
    def _get_param_names(self):
        return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
    def update_likelihood_approximation(self):
        """
        Approximates a non-gaussian likelihood using Expectation Propagation
        For a Gaussian likelihood, no iteration is required:
        this function does nothing
        """
        self.likelihood.fit_full(self.kern.K(self.X))
        self._set_params(self._get_params())  # update the GP
    def _model_fit_term(self):
        """
        Computes the model fit using YYT if it's available
        """
        if self.likelihood.YYT is None:
            tmp, _ = linalg.lapack.flapack.dtrtrs(self.L, np.asfortranarray(self.likelihood.Y), lower=1)
            return -0.5 * np.sum(np.square(tmp))
            #return -0.5 * np.sum(np.square(np.dot(self.Li, self.likelihood.Y)))
        else:
            return -0.5 * np.sum(np.multiply(self.Ki, self.likelihood.YYT))
    def log_likelihood(self):
        """
        The log marginal likelihood of the GP.
        For an EP model,  can be written as the log likelihood of a regression
        model for a new variable Y* = v_tilde/tau_tilde, with a covariance
        matrix K* = K + diag(1./tau_tilde) plus a normalization term.
        """
        return -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
    def _log_likelihood_gradients(self):
        """
        The gradient of all parameters.
        Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
        """
        return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
    def _raw_predict(self, _Xnew, which_parts='all', full_cov=False,stop=False):
        """
        Internal helper function for making predictions, does not account
        for normalization or likelihood
        """
        Kx = self.kern.K(_Xnew,self.X,which_parts=which_parts).T
        #KiKx = np.dot(self.Ki, Kx)
        KiKx, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(Kx), lower=1)
        mu = np.dot(KiKx.T, self.likelihood.Y)
        if full_cov:
            Kxx = self.kern.K(_Xnew, which_parts=which_parts)
            var = Kxx - np.dot(KiKx.T, Kx)
        else:
            Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts)
            var = Kxx - np.sum(np.multiply(KiKx, Kx), 0)
            var = var[:, None]
        if stop:
            debug_this
        return mu, var
    def predict(self, Xnew, which_parts='all', full_cov=False):
        """
        Predict the function(s) at the new point(s) Xnew.
        Arguments
        ---------
        :param Xnew: The points at which to make a prediction
        :type Xnew: np.ndarray, Nnew x self.Q
        :param which_parts:  specifies which outputs kernel(s) to use in prediction
        :type which_parts: ('all', list of bools)
        :param full_cov: whether to return the folll covariance matrix, or just the diagonal
        :type full_cov: bool
        :rtype: posterior mean,  a Numpy array, Nnew x self.D
        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
        :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays,  Nnew x self.D
           If full_cov and self.D > 1, the return shape of var is Nnew x Nnew x self.D. If self.D == 1, the return shape is Nnew x Nnew.
           This is to allow for different normalizations of the output dimensions.
        """
        # normalize X values
        Xnew = (Xnew.copy() - self._Xmean) / self._Xstd
        mu, var = self._raw_predict(Xnew, full_cov=full_cov, which_parts=which_parts)
        # now push through likelihood
        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
        return mean, var, _025pm, _975pm
--- a/GPy/core/init.py
+++ b/GPy/core/init.py
@ -1,7 +1,8 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
-
+from GP import GP
 from sparse_GP import sparse_GP
 from model import *
 from parameterised import *
 import priors
--- a/GPy/core/gp_base.py
+++ b/GPy/core/gp_base.py
@ -0,0 +1,129 @@
 import numpy as np
 import model
 from .. import kern
 from ..util.plot import gpplot, Tango, x_frame1D, x_frame2D
 import pylab as pb
 class GPBase(model.model):
    """
    Gaussian Process model for holding shared behaviour between
    sprase_GP and GP models
    """
    def __init__(self, X, likelihood, kernel, normalize_X=False):
        self.X = X
        assert len(self.X.shape) == 2
        self.N, self.Q = self.X.shape
        assert isinstance(kernel, kern.kern)
        self.kern = kernel
        self.likelihood = likelihood
        assert self.X.shape[0] == self.likelihood.data.shape[0]
        self.N, self.D = self.likelihood.data.shape
        if normalize_X:
            self._Xmean = X.mean(0)[None, :]
            self._Xstd = X.std(0)[None, :]
            self.X = (X.copy() - self._Xmean) / self._Xstd
        super(GPBase, self).__init__()
        # All leaf nodes should call self._set_params(self._get_params()) at
        # the end
    def _get_params(self):
        return np.hstack((self.kern._get_params_transformed(), self.likelihood._get_params()))
    def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False):
        """
        Plot the GP's view of the world, where the data is normalized and the
        likelihood is Gaussian.
        :param samples: the number of a posteriori samples to plot
        :param which_data: which if the training data to plot (default all)
        :type which_data: 'all' or a slice object to slice self.X, self.Y
        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
        :param which_parts: which of the kernel functions to plot (additively)
        :type which_parts: 'all', or list of bools
        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
        Plot the posterior of the GP.
          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
          - In two dimsensions, a contour-plot shows the mean predicted function
          - In higher dimensions, we've no implemented this yet !TODO!
        Can plot only part of the data and part of the posterior functions
        using which_data and which_functions
        """
        if which_data == 'all':
            which_data = slice(None)
        if self.X.shape[1] == 1:
            Xnew, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits)
            if samples == 0:
                m, v = self._raw_predict(Xnew, which_parts=which_parts)
                gpplot(Xnew, m, m - 2 * np.sqrt(v), m + 2 * np.sqrt(v))
                pb.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
            else:
                m, v = self._raw_predict(Xnew, which_parts=which_parts, full_cov=True)
                Ysim = np.random.multivariate_normal(m.flatten(), v, samples)
                gpplot(Xnew, m, m - 2 * np.sqrt(np.diag(v)[:, None]), m + 2 * np.sqrt(np.diag(v))[:, None])
                for i in range(samples):
                    pb.plot(Xnew, Ysim[i, :], Tango.colorsHex['darkBlue'], linewidth=0.25)
            pb.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
            pb.xlim(xmin, xmax)
            ymin, ymax = min(np.append(self.likelihood.Y, m - 2 * np.sqrt(np.diag(v)[:, None]))), max(np.append(self.likelihood.Y, m + 2 * np.sqrt(np.diag(v)[:, None])))
            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
            pb.ylim(ymin, ymax)
            if hasattr(self, 'Z'):
                pb.plot(self.Z, self.Z * 0 + pb.ylim()[0], 'r|', mew=1.5, markersize=12)
        elif self.X.shape[1] == 2:
            resolution = resolution or 50
            Xnew, xmin, xmax, xx, yy = x_frame2D(self.X, plot_limits, resolution)
            m, v = self._raw_predict(Xnew, which_parts=which_parts)
            m = m.reshape(resolution, resolution).T
            pb.contour(xx, yy, m, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
            pb.scatter(self.X[:, 0], self.X[:, 1], 40, self.likelihood.Y, linewidth=0, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max())
            pb.xlim(xmin[0], xmax[0])
            pb.ylim(xmin[1], xmax[1])
        else:
            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
    def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20):
        """
        TODO: Docstrings!
        :param levels: for 2D plotting, the number of contour levels to use
        """
        # TODO include samples
        if which_data == 'all':
            which_data = slice(None)
        if self.X.shape[1] == 1:
            Xu = self.X * self._Xstd + self._Xmean  # NOTE self.X are the normalized values now
            Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
            m, var, lower, upper = self.predict(Xnew, which_parts=which_parts)
            for d in range(m.shape[1]):
                gpplot(Xnew, m[:,d], lower[:,d], upper[:,d])
                pb.plot(Xu[which_data], self.likelihood.data[which_data,d], 'kx', mew=1.5)
            ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
            pb.xlim(xmin, xmax)
            pb.ylim(ymin, ymax)
        elif self.X.shape[1] == 2:  # FIXME
            resolution = resolution or 50
            Xnew, xx, yy, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
            x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
            m, var, lower, upper = self.predict(Xnew, which_parts=which_parts)
            m = m.reshape(resolution, resolution).T
            pb.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
            Yf = self.likelihood.Y.flatten()
            pb.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
            pb.xlim(xmin[0], xmax[0])
            pb.ylim(xmin[1], xmax[1])
        else:
            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@ -22,8 +22,8 @@ class model(parameterised):
        self.priors = [None for i in range(self._get_params().size)]
        self.optimization_runs = []
        self.sampling_runs = []
        self._set_params(self._get_params())
        self.preferred_optimizer = 'tnc'
        #self._set_params(self._get_params()) has been taken out as it should only be called on leaf nodes
    def _get_params(self):
        raise NotImplementedError, "this needs to be implemented to use the model class"
    def _set_params(self, x):
@ -212,7 +212,7 @@ class model(parameterised):
        currently_constrained = self.all_constrained_indices()
        to_make_positive = []
        for s in positive_strings:
-            for i in self.grep_param_names(s):
+            for i in self.grep_param_names(".*"+s):
                if not (i in currently_constrained):
                    #to_make_positive.append(re.escape(param_names[i]))
                    to_make_positive.append(i)
--- a/GPy/core/parameterised.py
+++ b/GPy/core/parameterised.py
@ -64,21 +64,21 @@ class parameterised(object):
        m['var'] = 2.           # > sets all parameters matching 'var' to 2.
        m['var'] = <array-like> # > sets parameters matching 'var' to <array-like>
        """
-    def get(self, name):
+    def get(self, regexp):
        warnings.warn(self._get_set_deprecation, FutureWarning, stacklevel=2)
-        return self[name]
+        return self[regexp]
-    def set(self, name, val):
+    def set(self, regexp, val):
        warnings.warn(self._get_set_deprecation, FutureWarning, stacklevel=2)
-        self[name] = val
+        self[regexp] = val
-    def __getitem__(self, name, return_names=False):
+    def __getitem__(self, regexp, return_names=False):
        """
        Get a model parameter by name.  The name is applied as a regular
        expression and all parameters that match that regular expression are
        returned.
        """
-        matches = self.grep_param_names(name)
+        matches = self.grep_param_names(regexp)
        if len(matches):
            if return_names:
                return self._get_params()[matches], np.asarray(self._get_param_names())[matches].tolist()
@ -103,8 +103,8 @@ class parameterised(object):
        else:
            raise AttributeError, "no parameter matches %s" % name
-    def tie_params(self, which):
+    def tie_params(self, regexp):
-        matches = self.grep_param_names(which)
+        matches = self.grep_param_names(regexp)
        assert matches.size > 0, "need at least something to tie together"
        if len(self.tied_indices):
            assert not np.any(matches[:, None] == np.hstack(self.tied_indices)), "Some indices are already tied!"
@ -119,28 +119,23 @@ class parameterised(object):
        """Unties all parameters by setting tied_indices to an empty list."""
        self.tied_indices = []
-    def grep_param_names(self, expr):
+    def grep_param_names(self, regexp):
        """
-        Arguments
+        :param regexp: regular expression to select parameter names
-        ---------
+        :type regexp: re | str | int
-        expr -- can be a regular expression object or a string to be turned into regular expression object.
+        :rtype: the indices of self._get_param_names which match the regular expression.
-        Returns
+        Note:-
-        -------
+          Other objects are passed through - i.e. integers which weren't meant for grepping
        the indices of self._get_param_names which match the regular expression.
        Notes
        -----
        Other objects are passed through - i.e. integers which weren't meant for grepping
        """
-        if type(expr) in [str, np.string_, np.str]:
+        if type(regexp) in [str, np.string_, np.str]:
-            expr = re.compile(expr)
+            regexp = re.compile(regexp)
-            return np.nonzero([expr.search(name) for name in self._get_param_names()])[0]
+            return np.nonzero([regexp.match(name) for name in self._get_param_names()])[0]
-        elif type(expr) is re._pattern_type:
+        elif type(regexp) is re._pattern_type:
-            return np.nonzero([expr.search(name) for name in self._get_param_names()])[0]
+            return np.nonzero([regexp.match(name) for name in self._get_param_names()])[0]
        else:
-            return expr
+            return regexp
    def Nparam_transformed(self):
        removed = 0
@ -152,9 +147,9 @@ class parameterised(object):
        return len(self._get_params()) - removed
-    def unconstrain(self, which):
+    def unconstrain(self, regexp):
        """Unconstrain matching parameters.  does not untie parameters"""
-        matches = self.grep_param_names(which)
+        matches = self.grep_param_names(regexp)
        #tranformed contraints:
        for match in matches:
@ -178,17 +173,17 @@ class parameterised(object):
        else:
            self.fixed_indices, self.fixed_values = [], []
-    def constrain_negative(self, which):
+    def constrain_negative(self, regexp):
        """ Set negative constraints. """
-        self.constrain(which, transformations.negative_exponent())
+        self.constrain(regexp, transformations.negative_exponent())
-    def constrain_positive(self, which):
+    def constrain_positive(self, regexp):
        """ Set positive constraints. """
-        self.constrain(which, transformations.logexp())
+        self.constrain(regexp, transformations.logexp())
-    def constrain_bounded(self, which,lower, upper):
+    def constrain_bounded(self, regexp,lower, upper):
        """ Set bounded constraints. """
-        self.constrain(which, transformations.logistic(lower, upper))
+        self.constrain(regexp, transformations.logistic(lower, upper))
    def all_constrained_indices(self):
        if len(self.constrained_indices) or len(self.fixed_indices):
@ -196,10 +191,10 @@ class parameterised(object):
        else:
            return np.empty(shape=(0,))
-    def constrain(self,which,transform):
+    def constrain(self,regexp,transform):
        assert isinstance(transform,transformations.transformation)
-        matches = self.grep_param_names(which)
+        matches = self.grep_param_names(regexp)
        overlap = set(matches).intersection(set(self.all_constrained_indices()))
        if overlap:
            self.unconstrain(np.asarray(list(overlap)))
@ -214,11 +209,11 @@ class parameterised(object):
        x[matches] = transform.initialize(x[matches])
        self._set_params(x)
-    def constrain_fixed(self, which, value=None):
+    def constrain_fixed(self, regexp, value=None):
        """
        Arguments
        ---------
-        :param which: np.array(dtype=int), or regular expression object or string
+        :param regexp: np.array(dtype=int), or regular expression object or string
        :param value: a float to fix the matched values to. If the value is not specified,
                 the parameter is fixed to the current value
@ -227,7 +222,7 @@ class parameterised(object):
        Fixing a parameter which is tied to another, or constrained in some way will result in an error.
        To fix multiple parameters to the same value, simply pass a regular expression which matches both parameter names, or pass both of the indexes
        """
-        matches = self.grep_param_names(which)
+        matches = self.grep_param_names(regexp)
        assert not np.any(matches[:, None] == self.all_constrained_indices()), "Some indices are already constrained"
        self.fixed_indices.append(matches)
        if value != None:
--- a/GPy/models/sparse_GP.py
+++ b/GPy/models/sparse_GP.py
@ -4,13 +4,12 @@
 import numpy as np
 import pylab as pb
 from ..util.linalg import mdot, jitchol, tdot, symmetrify, backsub_both_sides, chol_inv
 from ..util.plot import gpplot
 from .. import kern
 from GP import GP
 from scipy import linalg
 from ..likelihoods import Gaussian
 from gp_base import GPBase
-class sparse_GP(GP):
+class sparse_GP(GPBase):
    """
    Variational sparse GP model
@ -31,6 +30,8 @@ class sparse_GP(GP):
    """
    def __init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False):
        super(sparse_GP, self).__init__(X, likelihood, kernel, normalize_X=normalize_X)
        self.Z = Z
        self.M = Z.shape[0]
        self.likelihood = likelihood
@ -42,13 +43,13 @@ class sparse_GP(GP):
            self.has_uncertain_inputs = True
            self.X_variance = X_variance
-        GP.__init__(self, X, likelihood, kernel=kernel, normalize_X=normalize_X)
+        if normalize_X:
            self.Z = (self.Z.copy() - self._Xmean) / self._Xstd
        # normalize X uncertainty also
        if self.has_uncertain_inputs:
            self.X_variance /= np.square(self._Xstd)
    def _compute_kernel_matrices(self):
        # kernel computations, using BGPLVM notation
        self.Kmm = self.kern.K(self.Z)
@ -139,8 +140,6 @@ class sparse_GP(GP):
            self.partial_for_likelihood += 0.5 * self.D * (self.psi0.sum() * self.likelihood.precision ** 2 - np.trace(self.A) * self.likelihood.precision)
            self.partial_for_likelihood += self.likelihood.precision * (0.5 * np.sum(self.A * self.DBi_plus_BiPBi) - np.sum(np.square(self._LBi_Lmi_psi1V)))
    def log_likelihood(self):
        """ Compute the (lower bound on the) log marginal likelihood """
        if self.likelihood.is_heteroscedastic:
@ -282,3 +281,17 @@ class sparse_GP(GP):
        return mean, var, _025pm, _975pm
    def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20):
        super(sparse_GP, self).plot(samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20)
        if self.X.shape[1] == 1:
            Xu = self.X * self._Xstd + self._Xmean  # NOTE self.X are the normalized values now
            if self.has_uncertain_inputs:
                pb.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
                            xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
                            ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
            Zu = self.Z * self._Xstd + self._Xmean
            pb.plot(Zu, Zu * 0 + pb.ylim()[0], 'r|', mew=1.5, markersize=12)
                # pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_variance.flatten()))
        elif self.X.shape[1] == 2:  # FIXME
            pb.plot(self.Z[:, 0], self.Z[:, 1], 'wo')
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@ -7,6 +7,7 @@ from matplotlib import pyplot as plt
 import GPy
 from GPy.models.Bayesian_GPLVM import Bayesian_GPLVM
 from GPy.util.datasets import swiss_roll_generated
 from GPy.core.transformations import logexp
 default_seed = np.random.seed(123344)
@ -17,11 +18,11 @@ def BGPLVM(seed=default_seed):
    D = 4
    # generate GPLVM-like data
    X = np.random.rand(N, Q)
-    k = GPy.kern.rbf(Q)  + GPy.kern.white(Q, 0.00001)
+    k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001)
    K = k.K(X)
    Y = np.random.multivariate_normal(np.zeros(N), K, D).T
-    k = GPy.kern.rbf(Q, ARD=True) + GPy.kern.linear(Q, ARD=True) + GPy.kern.rbf(Q, ARD=True)  + GPy.kern.white(Q)
+    k = GPy.kern.rbf(Q, ARD=True) + GPy.kern.linear(Q, ARD=True) + GPy.kern.rbf(Q, ARD=True) + GPy.kern.white(Q)
    # k = GPy.kern.rbf(Q) + GPy.kern.rbf(Q) + GPy.kern.white(Q)
    # k = GPy.kern.rbf(Q) + GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001)
    # k = GPy.kern.rbf(Q, ARD = False)  + GPy.kern.white(Q, 0.00001)
@ -187,8 +188,8 @@ def _simulate_sincos(D1, D2, D3, N, M, Q, plot_sim=False):
    Y2 = S2.dot(np.random.randn(S2.shape[1], D2))
    Y3 = S3.dot(np.random.randn(S3.shape[1], D3))
-    Y1 += .1 * np.random.randn(*Y1.shape)
+    Y1 += .3 * np.random.randn(*Y1.shape)
-    Y2 += .1 * np.random.randn(*Y2.shape)
+    Y2 += .2 * np.random.randn(*Y2.shape)
    Y3 += .1 * np.random.randn(*Y3.shape)
    Y1 -= Y1.mean(0)
@ -262,13 +263,13 @@ def bgplvm_simulation(optimize='scg',
    # m.constrain('variance|noise', logexp_clipped())
    m.ensure_default_constraints()
    m['noise'] = Y.var() / 100.
-    m['linear_variance'] = .01
+    m['linear_variance'] = .001
    if optimize:
        print "Optimizing model:"
-        m.optimize('bfgs', max_iters=max_f_eval,
+        m.optimize('scg', max_iters=max_f_eval,
                   max_f_eval=max_f_eval,
-                   messages=True, gtol=1e-2)
+                   messages=True, gtol=1e-6)
    if plot:
        import pylab
        m.plot_X_1d()
@ -277,23 +278,21 @@ def bgplvm_simulation(optimize='scg',
        m.kern.plot_ARD()
    return m
-def mrd_simulation(optimize=True, plot_sim=False, **kw):
+def mrd_simulation(optimize=True, plot=True, plot_sim=True, **kw):
-    D1, D2, D3, N, M, Q = 150, 200, 400, 300, 3, 7
+    D1, D2, D3, N, M, Q = 150, 200, 400, 500, 3, 7
    slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, M, Q, plot_sim)
    from GPy.models import mrd
    from GPy import kern
    from GPy.core.transformations import logexp_clipped
    reload(mrd); reload(kern)
-    k = kern.linear(Q, [0.05] * Q, True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2))
+    k = kern.linear(Q, [.05] * Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2))
-    m = mrd.MRD(Ylist, Q=Q, M=M, kernels=k, initx="concat", initz='permute', **kw)
+    m = mrd.MRD(Ylist, Q=Q, M=M, kernels=k, initx="", initz='permute', **kw)
    for i, Y in enumerate(Ylist):
        m['{}_noise'.format(i + 1)] = Y.var() / 100.
    # m.constrain('variance|noise', logexp_clipped(1e-6))
    m.ensure_default_constraints()
    # DEBUG
@ -301,8 +300,10 @@ def mrd_simulation(optimize=True, plot_sim=False, **kw):
    if optimize:
        print "Optimizing Model:"
-        m.optimize('bfgs', messages=1, max_iters=3e3)
+        m.optimize('scg', messages=1, max_iters=5e4, max_f_eval=5e4)
-
+    if plot:
        m.plot_X_1d()
        m.plot_scales()
    return m
 def brendan_faces():
@ -323,7 +324,7 @@ def brendan_faces():
    m.ensure_default_constraints()
    m.optimize('scg', messages=1, max_f_eval=10000)
-    ax = m.plot_latent(which_indices=(0,1))
+    ax = m.plot_latent(which_indices=(0, 1))
    y = m.likelihood.Y[0, :]
    data_show = GPy.util.visualize.image_show(y[None, :], dimensions=(20, 28), transpose=True, invert=False, scale=False)
    lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax)
--- a/GPy/examples/regression.py
+++ b/GPy/examples/regression.py
@ -10,7 +10,7 @@ import numpy as np
 import GPy
-def toy_rbf_1d():
+def toy_rbf_1d(max_nb_eval_optim=100):
    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
    data = GPy.util.datasets.toy_rbf_1d()
@ -19,13 +19,13 @@ def toy_rbf_1d():
    # optimize
    m.ensure_default_constraints()
-    m.optimize()
+    m.optimize(max_f_eval=max_nb_eval_optim)
    # plot
    m.plot()
    print(m)
    return m
-def rogers_girolami_olympics():
+def rogers_girolami_olympics(max_nb_eval_optim=100):
    """Run a standard Gaussian process regression on the Rogers and Girolami olympics data."""
    data = GPy.util.datasets.rogers_girolami_olympics()
@ -34,14 +34,14 @@ def rogers_girolami_olympics():
    # optimize
    m.ensure_default_constraints()
-    m.optimize()
+    m.optimize(max_f_eval=max_nb_eval_optim)
    # plot
    m.plot(plot_limits = (1850, 2050))
    print(m)
    return m
-def toy_rbf_1d_50():
+def toy_rbf_1d_50(max_nb_eval_optim=100):
    """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance."""
    data = GPy.util.datasets.toy_rbf_1d_50()
@ -50,14 +50,14 @@ def toy_rbf_1d_50():
    # optimize
    m.ensure_default_constraints()
-    m.optimize()
+    m.optimize(max_f_eval=max_nb_eval_optim)
    # plot
    m.plot()
    print(m)
    return m
-def silhouette():
+def silhouette(max_nb_eval_optim=100):
    """Predict the pose of a figure given a silhouette. This is a task from Agarwal and Triggs 2004 ICML paper."""
    data = GPy.util.datasets.silhouette()
@ -66,12 +66,12 @@ def silhouette():
    # optimize
    m.ensure_default_constraints()
-    m.optimize(messages=True)
+    m.optimize(messages=True,max_f_eval=max_nb_eval_optim)
    print(m)
    return m
-def coregionalisation_toy2():
+def coregionalisation_toy2(max_nb_eval_optim=100):
    """
    A simple demonstration of coregionalisation on two sinusoidal functions.
    """
@ -90,8 +90,7 @@ def coregionalisation_toy2():
    m.constrain_fixed('rbf_var',1.)
    m.constrain_positive('kappa')
    m.ensure_default_constraints()
-    m.optimize('sim',max_f_eval=5000,messages=1)
+    m.optimize('sim',messages=1,max_f_eval=max_nb_eval_optim)
    #m.optimize()
    pb.figure()
    Xtest1 = np.hstack((np.linspace(0,9,100)[:,None],np.zeros((100,1))))
@ -104,7 +103,7 @@ def coregionalisation_toy2():
    pb.plot(X2[:,0],Y2[:,0],'gx',mew=2)
    return m
-def coregionalisation_toy():
+def coregionalisation_toy(max_nb_eval_optim=100):
    """
    A simple demonstration of coregionalisation on two sinusoidal functions.
    """
@ -123,7 +122,7 @@ def coregionalisation_toy():
    m.constrain_fixed('rbf_var',1.)
    m.constrain_positive('kappa')
    m.ensure_default_constraints()
-    m.optimize()
+    m.optimize(max_f_eval=max_nb_eval_optim)
    pb.figure()
    Xtest1 = np.hstack((np.linspace(0,9,100)[:,None],np.zeros((100,1))))
@ -137,7 +136,7 @@ def coregionalisation_toy():
    return m
-def coregionalisation_sparse():
+def coregionalisation_sparse(max_nb_eval_optim=100):
    """
    A simple demonstration of coregionalisation on two sinusoidal functions using sparse approximations.
    """
@ -162,7 +161,7 @@ def coregionalisation_sparse():
    m.constrain_positive('kappa')
    m.constrain_fixed('iip')
    m.ensure_default_constraints()
-    m.optimize_restarts(5,robust=True,messages=1)
+    m.optimize_restarts(5, robust=True, messages=1, max_f_eval=max_nb_eval_optim)
    pb.figure()
    Xtest1 = np.hstack((np.linspace(0,9,100)[:,None],np.zeros((100,1))))
@ -179,7 +178,7 @@ def coregionalisation_sparse():
    return m
-def multiple_optima(gene_number=937,resolution=80, model_restarts=10, seed=10000):
+def multiple_optima(gene_number=937,resolution=80, model_restarts=10, seed=10000, max_nb_eval_optim=100):
    """Show an example of a multimodal error surface for Gaussian process regression. Gene 939 has bimodal behaviour where the noisey mode is higher."""
    # Contour over a range of length scales and signal/noise ratios.
@ -217,7 +216,7 @@ def multiple_optima(gene_number=937,resolution=80, model_restarts=10, seed=10000
        # optimize
        m.ensure_default_constraints()
-        m.optimize(xtol=1e-6,ftol=1e-6)
+        m.optimize(xtol=1e-6, ftol=1e-6, max_f_eval=max_nb_eval_optim)
        optim_point_x[1] = m.get('rbf_lengthscale')
        optim_point_y[1] = np.log10(m.get('rbf_variance')) - np.log10(m.get('white_variance'));
@ -264,7 +263,7 @@ def _contour_data(data, length_scales, log_SNRs, signal_kernel_call=GPy.kern.rbf
        lls.append(length_scale_lls)
    return np.array(lls)
-def sparse_GP_regression_1D(N = 400, M = 5):
+def sparse_GP_regression_1D(N = 400, M = 5, max_nb_eval_optim=100):
    """Run a 1D example of a sparse GP regression."""
    # sample inputs and outputs
    X = np.random.uniform(-3.,3.,(N,1))
@ -279,11 +278,11 @@ def sparse_GP_regression_1D(N = 400, M = 5):
    m.constrain_positive('(variance|lengthscale|precision)')
    m.checkgrad(verbose=1)
-    m.optimize('tnc', messages = 1)
+    m.optimize('tnc', messages = 1, max_f_eval=max_nb_eval_optim)
    m.plot()
    return m
-def sparse_GP_regression_2D(N = 400, M = 50):
+def sparse_GP_regression_2D(N = 400, M = 50, max_nb_eval_optim=100):
    """Run a 2D example of a sparse GP regression."""
    X = np.random.uniform(-3.,3.,(N,2))
    Y = np.sin(X[:,0:1]) * np.sin(X[:,1:2])+np.random.randn(N,1)*0.05
@ -294,7 +293,7 @@ def sparse_GP_regression_2D(N = 400, M = 50):
    kernel = rbf + noise
    # create simple GP model
-    m = GPy.models.sparse_GP_regression(X,Y,kernel, M = M)
+    m = GPy.models.sparse_GP_regression(X,Y,kernel, M = M, max_nb_eval_optim=100)
    # contrain all parameters to be positive (but not inducing inputs)
    m.constrain_positive('(variance|lengthscale|precision)')
@ -304,12 +303,12 @@ def sparse_GP_regression_2D(N = 400, M = 50):
    # optimize and plot
    pb.figure()
-    m.optimize('tnc', messages = 1)
+    m.optimize('tnc', messages = 1, max_f_eval=max_nb_eval_optim)
    m.plot()
    print(m)
    return m
-def uncertain_inputs_sparse_regression():
+def uncertain_inputs_sparse_regression(max_nb_eval_optim=100):
    """Run a 1D example of a sparse GP regression with uncertain inputs."""
    # sample inputs and outputs
    S = np.ones((20,1))
@ -327,7 +326,7 @@ def uncertain_inputs_sparse_regression():
    m.constrain_positive('(variance|prec)')
    # optimize and plot
-    m.optimize('tnc', max_f_eval = 1000, messages=1)
+    m.optimize('tnc', messages=1, max_f_eval=max_nb_eval_optim)
    m.plot()
    print(m)
    return m
--- a/GPy/inference/SCG.py
+++ b/GPy/inference/SCG.py
@ -52,7 +52,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto
        ftol = 1e-6
    if gtol is None:
        gtol = 1e-5
-    sigma0 = 1.0e-4
+    sigma0 = 1.0e-8
    fold = f(x, *optargs) # Initial function value.
    function_eval = 1
    fnow = fold
--- a/GPy/likelihoods/Gaussian.py
+++ b/GPy/likelihoods/Gaussian.py
@ -51,11 +51,15 @@ class Gaussian(likelihood):
        return ["noise_variance"]
    def _set_params(self, x):
-        x = float(x)
+        x = np.float64(x)
        if self._variance != x:
-            self.precision = 1. / x
+            if x == 0.:
                self.precision = None
                self.V = None
            else:
                self.precision = 1. / x
                self.V = (self.precision) * self.Y
            self.covariance_matrix = np.eye(self.N) * x
            self.V = (self.precision) * self.Y
            self._variance = x
    def predictive_values(self, mu, var, full_cov):
--- a/GPy/models/Bayesian_GPLVM.py
+++ b/GPy/models/Bayesian_GPLVM.py
@ -5,7 +5,7 @@ import numpy as np
 import pylab as pb
 import sys, pdb
 from GPLVM import GPLVM
-from sparse_GP import sparse_GP
+from ..core import sparse_GP
 from GPy.util.linalg import pdinv
 from ..likelihoods import Gaussian
 from .. import kern
@ -65,6 +65,7 @@ class Bayesian_GPLVM(sparse_GP, GPLVM):
            self._savedABCD = []
        sparse_GP.__init__(self, X, likelihood, kernel, Z=Z, X_variance=X_variance, **kwargs)
        self._set_params(self._get_params())
    @property
    def oldps(self):
--- a/GPy/models/FITC.py
+++ b/GPy/models/FITC.py
@ -7,7 +7,7 @@ from ..util.linalg import mdot, jitchol, chol_inv, tdot, symmetrify,pdinv
 from ..util.plot import gpplot
 from .. import kern
 from scipy import stats, linalg
-from sparse_GP import sparse_GP
+from ..core import sparse_GP
 def backsub_both_sides(L,X):
    """ Return L^-T * X * L^-1, assumuing X is symmetrical and L is lower cholesky"""
@ -16,6 +16,9 @@ def backsub_both_sides(L,X):
 class FITC(sparse_GP):
    def __init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False):
        super(FITC, self).__init__(X, likelihood, kernel, normalize_X=normalize_X)
    def update_likelihood_approximation(self):
        """
        Approximates a non-gaussian likelihood using Expectation Propagation
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@ -1,288 +0,0 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 import numpy as np
 from scipy import linalg
 import pylab as pb
 from .. import kern
 from ..core import model
 from ..util.linalg import pdinv, mdot, tdot
 from ..util.plot import gpplot, x_frame1D, x_frame2D, Tango
 from ..likelihoods import EP
 class GP(model):
    """
    Gaussian Process model for regression and EP
    :param X: input observations
    :param kernel: a GPy kernel, defaults to rbf+white
    :parm likelihood: a GPy likelihood
    :param normalize_X:  whether to normalize the input data before computing (predictions will be in original scales)
    :type normalize_X: False|True
    :rtype: model object
    :param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1
    :param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.]
    :type powerep: list
    .. Note:: Multiple independent outputs are allowed using columns of Y
    """
    def __init__(self, X, likelihood, kernel, normalize_X=False):
        # parse arguments
        self.X = X
        assert len(self.X.shape) == 2
        self.N, self.Q = self.X.shape
        assert isinstance(kernel, kern.kern)
        self.kern = kernel
        self.likelihood = likelihood
        assert self.X.shape[0] == self.likelihood.data.shape[0]
        self.N, self.D = self.likelihood.data.shape
        # here's some simple normalization for the inputs
        if normalize_X:
            self._Xmean = X.mean(0)[None, :]
            self._Xstd = X.std(0)[None, :]
            self.X = (X.copy() - self._Xmean) / self._Xstd
            if hasattr(self, 'Z'):
                self.Z = (self.Z - self._Xmean) / self._Xstd
        else:
            self._Xmean = np.zeros((1, self.X.shape[1]))
            self._Xstd = np.ones((1, self.X.shape[1]))
        if not hasattr(self,'has_uncertain_inputs'):
            self.has_uncertain_inputs = False
        model.__init__(self)
    def dL_dZ(self):
        """
        TODO: one day we might like to learn Z by gradient methods?
        """
        #FIXME: this doesn;t live here.
        return np.zeros_like(self.Z)
    def _set_params(self, p):
        self.kern._set_params_transformed(p[:self.kern.Nparam_transformed()])
        self.likelihood._set_params(p[self.kern.Nparam_transformed():])
        self.K = self.kern.K(self.X)
        self.K += self.likelihood.covariance_matrix
        self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
        # the gradient of the likelihood wrt the covariance matrix
        if self.likelihood.YYT is None:
            #alpha = np.dot(self.Ki, self.likelihood.Y)
            alpha,_ = linalg.lapack.flapack.dpotrs(self.L, self.likelihood.Y,lower=1)
            self.dL_dK = 0.5 * (tdot(alpha) - self.D * self.Ki)
        else:
            #tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki)
            tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(self.likelihood.YYT), lower=1)
            tmp, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(tmp.T), lower=1)
            self.dL_dK = 0.5 * (tmp - self.D * self.Ki)
    def _get_params(self):
        return np.hstack((self.kern._get_params_transformed(), self.likelihood._get_params()))
    def _get_param_names(self):
        return self.kern._get_param_names_transformed() + self.likelihood._get_param_names()
    def update_likelihood_approximation(self):
        """
        Approximates a non-gaussian likelihood using Expectation Propagation
        For a Gaussian likelihood, no iteration is required:
        this function does nothing
        """
        self.likelihood.fit_full(self.kern.K(self.X))
        self._set_params(self._get_params())  # update the GP
    def _model_fit_term(self):
        """
        Computes the model fit using YYT if it's available
        """
        if self.likelihood.YYT is None:
            tmp, _ = linalg.lapack.flapack.dtrtrs(self.L, np.asfortranarray(self.likelihood.Y), lower=1)
            return -0.5 * np.sum(np.square(tmp))
            #return -0.5 * np.sum(np.square(np.dot(self.Li, self.likelihood.Y)))
        else:
            return -0.5 * np.sum(np.multiply(self.Ki, self.likelihood.YYT))
    def log_likelihood(self):
        """
        The log marginal likelihood of the GP.
        For an EP model,  can be written as the log likelihood of a regression
        model for a new variable Y* = v_tilde/tau_tilde, with a covariance
        matrix K* = K + diag(1./tau_tilde) plus a normalization term.
        """
        return -0.5 * self.D * self.K_logdet + self._model_fit_term() + self.likelihood.Z
    def _log_likelihood_gradients(self):
        """
        The gradient of all parameters.
        Note, we use the chain rule: dL_dtheta = dL_dK * d_K_dtheta
        """
        return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
    def _raw_predict(self, _Xnew, which_parts='all', full_cov=False,stop=False):
        """
        Internal helper function for making predictions, does not account
        for normalization or likelihood
        """
        Kx = self.kern.K(_Xnew,self.X,which_parts=which_parts).T
        #KiKx = np.dot(self.Ki, Kx)
        KiKx, _ = linalg.lapack.flapack.dpotrs(self.L, np.asfortranarray(Kx), lower=1)
        mu = np.dot(KiKx.T, self.likelihood.Y)
        if full_cov:
            Kxx = self.kern.K(_Xnew, which_parts=which_parts)
            var = Kxx - np.dot(KiKx.T, Kx)
        else:
            Kxx = self.kern.Kdiag(_Xnew, which_parts=which_parts)
            var = Kxx - np.sum(np.multiply(KiKx, Kx), 0)
            var = var[:, None]
        if stop:
            debug_this
        return mu, var
    def predict(self, Xnew, which_parts='all', full_cov=False):
        """
        Predict the function(s) at the new point(s) Xnew.
        Arguments
        ---------
        :param Xnew: The points at which to make a prediction
        :type Xnew: np.ndarray, Nnew x self.Q
        :param which_parts:  specifies which outputs kernel(s) to use in prediction
        :type which_parts: ('all', list of bools)
        :param full_cov: whether to return the folll covariance matrix, or just the diagonal
        :type full_cov: bool
        :rtype: posterior mean,  a Numpy array, Nnew x self.D
        :rtype: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise
        :rtype: lower and upper boundaries of the 95% confidence intervals, Numpy arrays,  Nnew x self.D
           If full_cov and self.D > 1, the return shape of var is Nnew x Nnew x self.D. If self.D == 1, the return shape is Nnew x Nnew.
           This is to allow for different normalizations of the output dimensions.
        """
        # normalize X values
        Xnew = (Xnew.copy() - self._Xmean) / self._Xstd
        mu, var = self._raw_predict(Xnew, full_cov=full_cov, which_parts=which_parts)
        # now push through likelihood
        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
        return mean, var, _025pm, _975pm
    def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False):
        """
        Plot the GP's view of the world, where the data is normalized and the
        likelihood is Gaussian.
        :param samples: the number of a posteriori samples to plot
        :param which_data: which if the training data to plot (default all)
        :type which_data: 'all' or a slice object to slice self.X, self.Y
        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
        :param which_parts: which of the kernel functions to plot (additively)
        :type which_parts: 'all', or list of bools
        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
        Plot the posterior of the GP.
          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
          - In two dimsensions, a contour-plot shows the mean predicted function
          - In higher dimensions, we've no implemented this yet !TODO!
        Can plot only part of the data and part of the posterior functions
        using which_data and which_functions
        """
        if which_data == 'all':
            which_data = slice(None)
        if self.X.shape[1] == 1:
            Xnew, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits)
            if samples == 0:
                m, v = self._raw_predict(Xnew, which_parts=which_parts)
                gpplot(Xnew, m, m - 2 * np.sqrt(v), m + 2 * np.sqrt(v))
                pb.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
            else:
                m, v = self._raw_predict(Xnew, which_parts=which_parts, full_cov=True)
                Ysim = np.random.multivariate_normal(m.flatten(), v, samples)
                gpplot(Xnew, m, m - 2 * np.sqrt(np.diag(v)[:, None]), m + 2 * np.sqrt(np.diag(v))[:, None])
                for i in range(samples):
                    pb.plot(Xnew, Ysim[i, :], Tango.colorsHex['darkBlue'], linewidth=0.25)
            pb.plot(self.X[which_data], self.likelihood.Y[which_data], 'kx', mew=1.5)
            pb.xlim(xmin, xmax)
            ymin, ymax = min(np.append(self.likelihood.Y, m - 2 * np.sqrt(np.diag(v)[:, None]))), max(np.append(self.likelihood.Y, m + 2 * np.sqrt(np.diag(v)[:, None])))
            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
            pb.ylim(ymin, ymax)
            if hasattr(self, 'Z'):
                pb.plot(self.Z, self.Z * 0 + pb.ylim()[0], 'r|', mew=1.5, markersize=12)
        elif self.X.shape[1] == 2:
            resolution = resolution or 50
            Xnew, xmin, xmax, xx, yy = x_frame2D(self.X, plot_limits, resolution)
            m, v = self._raw_predict(Xnew, which_parts=which_parts)
            m = m.reshape(resolution, resolution).T
            pb.contour(xx, yy, m, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
            pb.scatter(Xorig[:, 0], Xorig[:, 1], 40, Yorig, linewidth=0, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max())
            pb.xlim(xmin[0], xmax[0])
            pb.ylim(xmin[1], xmax[1])
        else:
            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
    def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20):
        """
        TODO: Docstrings!
        :param levels: for 2D plotting, the number of contour levels to use
        """
        # TODO include samples
        if which_data == 'all':
            which_data = slice(None)
        if self.X.shape[1] == 1:
            Xu = self.X * self._Xstd + self._Xmean  # NOTE self.X are the normalized values now
            Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits)
            m, var, lower, upper = self.predict(Xnew, which_parts=which_parts)
            for d in range(m.shape[1]):
                gpplot(Xnew, m[:,d], lower[:,d], upper[:,d])
                pb.plot(Xu[which_data], self.likelihood.data[which_data,d], 'kx', mew=1.5)
            if self.has_uncertain_inputs:
                pb.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0],
                            xerr=2 * np.sqrt(self.X_variance[which_data, 0]),
                            ecolor='k', fmt=None, elinewidth=.5, alpha=.5)
            ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper))
            ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin)
            pb.xlim(xmin, xmax)
            pb.ylim(ymin, ymax)
            if hasattr(self, 'Z'):
                Zu = self.Z * self._Xstd + self._Xmean
                pb.plot(Zu, Zu * 0 + pb.ylim()[0], 'r|', mew=1.5, markersize=12)
                    # pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_variance.flatten()))
        elif self.X.shape[1] == 2:  # FIXME
            resolution = resolution or 50
            Xnew, xx, yy, xmin, xmax = x_frame2D(self.X, plot_limits, resolution)
            x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution)
            m, var, lower, upper = self.predict(Xnew, which_parts=which_parts)
            m = m.reshape(resolution, resolution).T
            pb.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet)
            Yf = self.likelihood.Y.flatten()
            pb.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.)
            pb.xlim(xmin[0], xmax[0])
            pb.ylim(xmin[1], xmax[1])
            if hasattr(self, 'Z'):
                pb.plot(self.Z[:, 0], self.Z[:, 1], 'wo')
        else:
            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
--- a/GPy/models/GPLVM.py
+++ b/GPy/models/GPLVM.py
@ -8,7 +8,7 @@ import sys, pdb
 from .. import kern
 from ..core import model
 from ..util.linalg import pdinv, PCA
-from GP import GP
+from ..core import GP
 from ..likelihoods import Gaussian
 from .. import util
 from GPy.util import plot_latent
@ -32,7 +32,8 @@ class GPLVM(GP):
        if kernel is None:
            kernel = kern.rbf(Q, ARD=Q>1) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2))
        likelihood = Gaussian(Y, normalize=normalize_Y)
-        GP.__init__(self, X, likelihood, kernel, **kwargs)
+        super(GPLVM, self).__init__(self, X, likelihood, kernel, **kwargs)
        self._set_params(self._get_params())
    def initialise_latent(self, init, Q, Y):
        if init == 'PCA':
--- a/GPy/models/GP_regression.py
+++ b/GPy/models/GP_regression.py
@ -3,7 +3,7 @@
 import numpy as np
-from GP import GP
+from ..core import GP
 from .. import likelihoods
 from .. import kern
@ -31,4 +31,5 @@ class GP_regression(GP):
        likelihood = likelihoods.Gaussian(Y,normalize=normalize_Y)
-        GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
+        super(GP_regression, self).__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
        self._set_params(self._get_params())
--- a/GPy/models/init.py
+++ b/GPy/models/init.py
@ -2,9 +2,9 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
-from GP import GP
+#from GP import GP
 #from sparse_GP import sparse_GP
 from GP_regression import GP_regression
 from sparse_GP import sparse_GP
 from sparse_GP_regression import sparse_GP_regression
 from GPLVM import GPLVM
 from warped_GP import warpedGP
--- a/GPy/models/generalized_FITC.py
+++ b/GPy/models/generalized_FITC.py
@ -7,7 +7,7 @@ from ..util.linalg import mdot, jitchol, chol_inv, pdinv, trace_dot
 from ..util.plot import gpplot
 from .. import kern
 from scipy import stats, linalg
-from sparse_GP import sparse_GP
+from ..core import sparse_GP
 def backsub_both_sides(L,X):
    """ Return L^-T * X * L^-1, assumuing X is symmetrical and L is lower cholesky"""
@ -36,12 +36,12 @@ class generalized_FITC(sparse_GP):
    """
    def __init__(self, X, likelihood, kernel, Z, X_variance=None, normalize_X=False):
        self.Z = Z
        self.M = self.Z.shape[0]
        self.true_precision = likelihood.precision
-        sparse_GP.__init__(self, X, likelihood, kernel=kernel, Z=self.Z, X_variance=None, normalize_X=False)
+        super(generalized_FITC, self).__init__(self, X, likelihood, kernel=kernel, Z=self.Z, X_variance=None, normalize_X=False)
        self._set_params(self._get_params())
    def _set_params(self, p):
        self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
--- a/GPy/models/mrd.py
+++ b/GPy/models/mrd.py
@ -5,7 +5,7 @@ Created on 10 Apr 2013
 '''
 from GPy.core import model
 from GPy.models.Bayesian_GPLVM import Bayesian_GPLVM
-from GPy.models.sparse_GP import sparse_GP
+from GPy.core import sparse_GP
 from GPy.util.linalg import PCA
 from scipy import linalg
 import numpy
@ -33,8 +33,11 @@ class MRD(model):
        Initial latent space
    :param X_variance:
        Initial latent space variance
-    :param init: [PCA|random]
+    :param init: [cooncat|single|random]
-        initialization method to use
+        initialization method to use: 
            *concat: PCA on concatenated outputs
            *single: PCA on each output
            *random: random
    :param M:
        number of inducing inputs to use
    :param Z:
@ -77,6 +80,7 @@ class MRD(model):
        self.MQ = self.M * self.Q
        model.__init__(self) # @UndefinedVariable
        self._set_params(self._get_params())
    @property
    def X(self):
--- a/GPy/models/sparse_GP_regression.py
+++ b/GPy/models/sparse_GP_regression.py
@ -3,7 +3,7 @@
 import numpy as np
-from sparse_GP import sparse_GP
+from ..core import sparse_GP
 from .. import likelihoods
 from .. import kern
 from ..likelihoods import likelihood
@ -43,4 +43,5 @@ class sparse_GP_regression(sparse_GP):
        #likelihood defaults to Gaussian
        likelihood = likelihoods.Gaussian(Y,normalize=normalize_Y)
-        sparse_GP.__init__(self, X, likelihood, kernel, Z, normalize_X=normalize_X)
+        super(sparse_GP_regression, self).__init__(self, X, likelihood, kernel, Z, normalize_X=normalize_X)
        self._set_params(self._get_params())
--- a/GPy/models/warped_GP.py
+++ b/GPy/models/warped_GP.py
@ -9,7 +9,7 @@ from ..util.linalg import pdinv
 from ..util.plot import gpplot
 from ..util.warping_functions import *
 from GP_regression import GP_regression
-from GP import GP
+from ..core import GP
 from .. import likelihoods
 from .. import kern
@ -29,7 +29,8 @@ class warpedGP(GP):
        self.predict_in_warped_space = False
        likelihood = likelihoods.Gaussian(self.transform_data(), normalize=normalize_Y)
-        GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
+        super(warpedGP, self).__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
        self._set_params(self._get_params())
    def _scale_data(self, Y):
        self._Ymax = Y.max()
--- a/GPy/notes.txt
+++ b/GPy/notes.txt
@ -1,61 +0,0 @@
 the predict method for GP_regression returns a covariance matrix which is a bad idea as this takes a lot to compute, it's also confusing for first time users. Should only be returned if the user explicitly requests it. 
 FIXED
 When computing kernel.K for kernels like rbf, you can't compute a version with rbf.K(X) you have to do rbf.K(X, X)
 FIXED
 Change Youter to YYT (Youter doesn't mean anything for matrices).
 FIXED
 Change get_param and set_param to get_params and set_params
 FIXED
 Fails in weird ways if you pass a integer as the input instead of a double to the kernel.
 FIXED
 The Matern kernels (at least the 52) still is working in the ARD manner which means it wouldn't run for very large input dimension. Needs to be fixed to match the RBF.
 FIXED
 Implementing new covariances is too complicated at the moment. We need a barebones example of what to implement and where. Commenting in the covariance matrices needs to be improved. It's not clear to a user what all the psi parts are for. Maybe we need a cut down and simplified example to help with this (perhaps a cut down version of the RBF?). And then we should provide a simple list of what you need to do to get a new kernel going.
 TODO, a priority for this release
 Missing kernels: polynomial, rational quadratic.
 TODO, should be straightforward when the above is fixed.
 Need an implementation of scaled conjugate gradients for the optimizers.
 UPSTREAM: scipy are tidying up the optimize module. let's wait for their next release. 
 Need an implementation of gradient descent for the optimizers (works well with GP-LVM for small random initializations)
 As above. 
 Need Carl Rasmussen's permission to add his conjugate gradients algorithm. In fact, we can just provide a hook for it, and post a separate python implementation of his algorithm.
 Any word from Carl yet?
 Get constrain param by default inside model creation.
 Well, we have ensure_default_constraints. There are some techinical difficulties in doing it inside model creation, so perhaps this is something for a later release.
 Bug when running classification.crescent_data()
 TODO.
 Do all optimizers work only in terms of function evaluations? Do we need to check for one that uses iterations?
 Upstream: Waiting for the new scipy, where the optimisers have been unified. Obviously it's be much better to be able to specify a unified set of args. 
 Tolerances for optimizers, do we need to introduce some standardization? At the moment does each have its own defaults?
 Upstream, as above
 A dictionary for parameter storage? So we can go through names easily?
 Wontfix. Dictionaries bring up all kinds of problems since they're not ordered. it's easy enough to do:
 for val, name in zip(m._get_params(), m._get_param_names()): foobar
 A flag on covariance functions that indicates when they are not associated with an underlying function (like white noise or a coregionalization matrix).
 TODO, agree this would be helpful.
 Diagonal noise covariance function
 TODO this is now straightforward using the likelihood framework, or as a kern. NF also requires a similar kind of kern function (a fixed form kernel)
 Long term: automatic Lagrange multiplier calculation for optimizers: constrain two parameters in an unusual way and the model automatically does the Lagrangian. Also augment the parameters with new ones, so define data variance to be white noise plus RBF variance and optimize over that and signal to noise ratio ... for example constrain the sum of variances to equal the known variance of the data.
 Randomize doesn't seem to cover a wide enough range for restarts ... try it for a model where inputs are widely spaced apart and length scale is too short. Sampling from N(0,1) is too conservative. Dangerous for people who naively use restarts. Since we have the model we could maybe come up with some sensible heuristics for setting these things. Maybe we should also consider having '.initialize()'. If we can't do this well we should disable the restart method.
 Excellent proposal, but lots of work: suggest leaving for the next release?
--- a/GPy/testing/kernel_tests.py
+++ b/GPy/testing/kernel_tests.py
@ -8,7 +8,7 @@ import GPy
 class KernelTests(unittest.TestCase):
    def test_kerneltie(self):
        K = GPy.kern.rbf(5, ARD=True)
-        K.tie_params('[01]')
+        K.tie_params('.*[01]')
        K.constrain_fixed('2')
        X = np.random.rand(5,5)
        Y = np.ones((5,1))
--- a/GPy/testing/unit_tests.py
+++ b/GPy/testing/unit_tests.py
@ -22,7 +22,7 @@ class GradientTests(unittest.TestCase):
        self.X2D = np.random.uniform(-3.,3.,(40,2))
        self.Y2D = np.sin(self.X2D[:,0:1]) * np.sin(self.X2D[:,1:2])+np.random.randn(40,1)*0.05
-    def check_model_with_white(self, kern, model_type='GP_regression', dimension=1, constraint=''):
+    def check_model_with_white(self, kern, model_type='GP_regression', dimension=1):
        #Get the correct gradients
        if dimension == 1:
            X = self.X1D
@ -37,7 +37,7 @@ class GradientTests(unittest.TestCase):
        noise = GPy.kern.white(dimension)
        kern = kern + noise
        m = model_fit(X, Y, kernel=kern)
-        m.constrain_positive(constraint)
+        m.ensure_default_constraints()
        m.randomize()
        # contrain all parameters to be positive
        self.assertTrue(m.checkgrad())
@ -135,12 +135,12 @@ class GradientTests(unittest.TestCase):
    def test_sparse_GP_regression_rbf_white_kern_1d(self):
        ''' Testing the sparse GP regression with rbf kernel with white kernel on 1d data '''
        rbf = GPy.kern.rbf(1)
-        self.check_model_with_white(rbf, model_type='sparse_GP_regression', dimension=1, constraint='(variance|lengthscale|precision)')
+        self.check_model_with_white(rbf, model_type='sparse_GP_regression', dimension=1)
    def test_sparse_GP_regression_rbf_white_kern_2D(self):
        ''' Testing the sparse GP regression with rbf and white kernel on 2d data '''
        rbf = GPy.kern.rbf(2)
-        self.check_model_with_white(rbf, model_type='sparse_GP_regression', dimension=2, constraint='(variance|lengthscale|precision)')
+        self.check_model_with_white(rbf, model_type='sparse_GP_regression', dimension=2)
    def test_GPLVM_rbf_bias_white_kern_2D(self):
        """ Testing GPLVM with rbf + bias and white kernel """
@ -150,7 +150,7 @@ class GradientTests(unittest.TestCase):
        K = k.K(X)
        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
        m = GPy.models.GPLVM(Y, Q, kernel = k)
-        m.constrain_positive('(rbf|bias|white)')
+        m.ensure_default_constraints()
        self.assertTrue(m.checkgrad())
    def test_GPLVM_rbf_linear_white_kern_2D(self):
@ -161,7 +161,7 @@ class GradientTests(unittest.TestCase):
        K = k.K(X)
        Y = np.random.multivariate_normal(np.zeros(N),K,D).T
        m = GPy.models.GPLVM(Y, Q, init = 'PCA', kernel = k)
-        m.constrain_positive('(linear|bias|white)')
+        m.ensure_default_constraints()
        self.assertTrue(m.checkgrad())
    def test_GP_EP_probit(self):
--- a/GPy/util/plot.py
+++ b/GPy/util/plot.py
@ -1,4 +1,4 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# #Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -0,0 +1,4 @@
 include *.txt
 recursive-include docs *.txt
 include *.md
 recursive-include docs *.md
--- a/setup.py
+++ b/setup.py
@ -23,14 +23,12 @@ setup(name = 'GPy',
      package_data = {'GPy': ['GPy/examples']},
      py_modules = ['GPy.__init__'],
      long_description=read('README.md'),
      #ext_modules =  [Extension(name = 'GPy.kern.lfmUpsilonf2py',
      #          sources = ['GPy/kern/src/lfmUpsilonf2py.f90'])],
      install_requires=['sympy', 'numpy>=1.6', 'scipy>=0.9','matplotlib>=1.1', 'nose'],
      extras_require = {
        'docs':['Sphinx', 'ipython'],
      },
      #setup_requires=['sphinx'],
      #cmdclass = {'build_sphinx': BuildDoc},
      classifiers=[
      "License :: OSI Approved :: BSD License"],
      #ext_modules =  [Extension(name = 'GPy.kern.lfmUpsilonf2py',
      #          sources = ['GPy/kern/src/lfmUpsilonf2py.f90'])],
      )
`@ -1,4 +1,4 @@`
	`# Copyright (c) 2012, GPy authors (see AUTHORS.txt).`	`# #Copyright (c) 2012, GPy authors (see AUTHORS.txt).`
	`# Licensed under the BSD 3-clause license (see LICENSE.txt)`	`# Licensed under the BSD 3-clause license (see LICENSE.txt)`