diff --git a/GPy/__init__.py b/GPy/__init__.py index 04129139..f35fda78 100644 --- a/GPy/__init__.py +++ b/GPy/__init__.py @@ -5,6 +5,7 @@ warnings.filterwarnings("ignore", category=DeprecationWarning) import core import models +import mappings import inference import util import examples diff --git a/GPy/core/__init__.py b/GPy/core/__init__.py index e9e049b0..9813d5ae 100644 --- a/GPy/core/__init__.py +++ b/GPy/core/__init__.py @@ -2,9 +2,10 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) from model import * -from parameterised import * +from parameterized import * import priors from gp import GP from sparse_gp import SparseGP from fitc import FITC from svigp import SVIGP +from mapping import * diff --git a/GPy/core/domains.py b/GPy/core/domains.py index dfc880f6..cefac6c2 100644 --- a/GPy/core/domains.py +++ b/GPy/core/domains.py @@ -2,6 +2,22 @@ Created on 4 Jun 2013 @author: maxz + +(Hyper-)Parameter domains defined for :py:mod:`~GPy.core.priors` and :py:mod:`~GPy.kern`. +These domains specify the legitimate realm of the parameters to live in. + +:const:`~GPy.core.domains.REAL` : + real domain, all values in the real numbers are allowed + +:const:`~GPy.core.domains.POSITIVE`: + positive domain, only positive real values are allowed + +:const:`~GPy.core.domains.NEGATIVE`: + same as :const:`~GPy.core.domains.POSITIVE`, but only negative values are allowed + +:const:`~GPy.core.domains.BOUNDED`: + only values within the bounded range are allowed, + the bounds are specified within the object with the bounded range ''' REAL = 'real' diff --git a/GPy/core/gp.py b/GPy/core/gp.py index 5172d9e7..278ddc74 100644 --- a/GPy/core/gp.py +++ b/GPy/core/gp.py @@ -6,7 +6,6 @@ import numpy as np import pylab as pb from .. import kern from ..util.linalg import pdinv, mdot, tdot, dpotrs, dtrtrs -#from ..util.plot import gpplot, Tango from ..likelihoods import EP from gp_base import GPBase @@ -31,6 +30,13 @@ class GP(GPBase): GPBase.__init__(self, X, likelihood, kernel, normalize_X=normalize_X) self._set_params(self._get_params()) + def getstate(self): + return GPBase.getstate(self) + + def setstate(self, state): + GPBase.setstate(self, state) + self._set_params(self._get_params()) + def _set_params(self, p): self.kern._set_params_transformed(p[:self.kern.num_params_transformed()]) self.likelihood._set_params(p[self.kern.num_params_transformed():]) @@ -42,12 +48,12 @@ class GP(GPBase): # the gradient of the likelihood wrt the covariance matrix if self.likelihood.YYT is None: - #alpha = np.dot(self.Ki, self.likelihood.Y) - alpha,_ = dpotrs(self.L, self.likelihood.Y,lower=1) + # alpha = np.dot(self.Ki, self.likelihood.Y) + alpha, _ = dpotrs(self.L, self.likelihood.Y, lower=1) self.dL_dK = 0.5 * (tdot(alpha) - self.output_dim * self.Ki) else: - #tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki) + # tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki) tmp, _ = dpotrs(self.L, np.asfortranarray(self.likelihood.YYT), lower=1) tmp, _ = dpotrs(self.L, np.asfortranarray(tmp.T), lower=1) self.dL_dK = 0.5 * (tmp - self.output_dim * self.Ki) @@ -68,7 +74,7 @@ class GP(GPBase): """ self.likelihood.restart() self.likelihood.fit_full(self.kern.K(self.X)) - self._set_params(self._get_params()) # update the GP + self._set_params(self._get_params()) # update the GP def _model_fit_term(self): """ @@ -77,7 +83,7 @@ class GP(GPBase): if self.likelihood.YYT is None: tmp, _ = dtrtrs(self.L, np.asfortranarray(self.likelihood.Y), lower=1) return -0.5 * np.sum(np.square(tmp)) - #return -0.5 *
np.sum(np.square(np.dot(self.Li, self.likelihood.Y))) + # return -0.5 * np.sum(np.square(np.dot(self.Li, self.likelihood.Y))) else: return -0.5 * np.sum(np.multiply(self.Ki, self.likelihood.YYT)) @@ -89,7 +95,8 @@ class GP(GPBase): model for a new variable Y* = v_tilde/tau_tilde, with a covariance matrix K* = K + diag(1./tau_tilde) plus a normalization term. """ - return -0.5 * self.output_dim * self.K_logdet + self._model_fit_term() + self.likelihood.Z + return (-0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) - + 0.5 * self.output_dim * self.K_logdet + self._model_fit_term() + self.likelihood.Z) def _log_likelihood_gradients(self): @@ -100,13 +107,13 @@ class GP(GPBase): """ return np.hstack((self.kern.dK_dtheta(dL_dK=self.dL_dK, X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK)))) - def _raw_predict(self, _Xnew, which_parts='all', full_cov=False,stop=False): + def _raw_predict(self, _Xnew, which_parts='all', full_cov=False, stop=False): """ Internal helper function for making predictions, does not account for normalization or likelihood """ - Kx = self.kern.K(_Xnew,self.X,which_parts=which_parts).T - #KiKx = np.dot(self.Ki, Kx) + Kx = self.kern.K(_Xnew, self.X, which_parts=which_parts).T + # KiKx = np.dot(self.Ki, Kx) KiKx, _ = dpotrs(self.L, np.asfortranarray(Kx), lower=1) mu = np.dot(KiKx.T, self.likelihood.Y) if full_cov: @@ -120,7 +127,7 @@ class GP(GPBase): debug_this # @UndefinedVariable return mu, var - def predict(self, Xnew, which_parts='all', full_cov=False): + def predict(self, Xnew, which_parts='all', full_cov=False, likelihood_args=dict()): """ Predict the function(s) at the new point(s) Xnew. Arguments @@ -145,6 +152,6 @@ class GP(GPBase): mu, var = self._raw_predict(Xnew, full_cov=full_cov, which_parts=which_parts) # now push through likelihood - mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov) + mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov, **likelihood_args) return mean, var, _025pm, _975pm diff --git a/GPy/core/gp_base.py b/GPy/core/gp_base.py index b82f3298..6935fc6f 100644 --- a/GPy/core/gp_base.py +++ b/GPy/core/gp_base.py @@ -6,8 +6,8 @@ from GPy.core.model import Model class GPBase(Model): """ - Gaussian Process Model for holding shared behaviour between - sprase_GP and GP models + Gaussian process base model for holding shared behaviour between + sparse_GP and GP models. """ def __init__(self, X, likelihood, kernel, normalize_X=False): @@ -29,23 +29,39 @@ class GPBase(Model): self._Xscale = np.ones((1, self.input_dim)) super(GPBase, self).__init__() - #Model.__init__(self) + # Model.__init__(self) # All leaf nodes should call self._set_params(self._get_params()) at # the end + def getstate(self): + """ + Get the current state of the class, here we return everything that is needed to recompute the model. 
+ """ + return Model.getstate(self) + [self.X, + self.num_data, + self.input_dim, + self.kern, + self.likelihood, + self.output_dim, + self._Xoffset, + self._Xscale] + + def setstate(self, state): + self._Xscale = state.pop() + self._Xoffset = state.pop() + self.output_dim = state.pop() + self.likelihood = state.pop() + self.kern = state.pop() + self.input_dim = state.pop() + self.num_data = state.pop() + self.X = state.pop() + Model.setstate(self, state) + def plot_f(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, full_cov=False, fignum=None, ax=None): """ Plot the GP's view of the world, where the data is normalized and the likelihood is Gaussian. - :param samples: the number of a posteriori samples to plot - :param which_data: which if the training data to plot (default all) - :type which_data: 'all' or a slice object to slice self.X, self.Y - :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits - :param which_parts: which of the kernel functions to plot (additively) - :type which_parts: 'all', or list of bools - :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D - Plot the posterior of the GP. - In one dimension, the function is plotted with a shaded region identifying two standard deviations. - In two dimsensions, a contour-plot shows the mean predicted function @@ -53,6 +69,22 @@ class GPBase(Model): Can plot only part of the data and part of the posterior functions using which_data and which_functions + + :param samples: the number of a posteriori samples to plot + :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits + :param which_data: which if the training data to plot (default all) + :type which_data: 'all' or a slice object to slice self.X, self.Y + :param which_parts: which of the kernel functions to plot (additively) + :type which_parts: 'all', or list of bools + :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D + :type resolution: int + :param full_cov: + :type full_cov: bool + :param fignum: figure to plot on. + :type fignum: figure number + :param ax: axes to plot on. + :type ax: axes handle + """ if which_data == 'all': which_data = slice(None) @@ -91,12 +123,43 @@ class GPBase(Model): else: raise NotImplementedError, "Cannot define a frame with more than two input dimensions" - def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None): + def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, fixed_inputs=[], linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue']): """ - TODO: Docstrings! - + Plot the GP with noise where the likelihood is Gaussian. + + Plot the posterior of the GP. + - In one dimension, the function is plotted with a shaded region identifying two standard deviations. + - In two dimsensions, a contour-plot shows the mean predicted function + - In higher dimensions, we've no implemented this yet !TODO! + + Can plot only part of the data and part of the posterior functions + using which_data and which_functions + + :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. 
Defaults to data limits + :type plot_limits: np.array + :param which_data: which of the training data to plot (default all) + :type which_data: 'all' or a slice object to slice self.X, self.Y + :param which_parts: which of the kernel functions to plot (additively) + :type which_parts: 'all', or list of bools + :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D + :type resolution: int + :param levels: number of levels to plot in a contour plot. + :type levels: int + :param samples: the number of a posteriori samples to plot + :type samples: int + :param fignum: figure to plot on. + :type fignum: figure number + :param ax: axes to plot on. + :type ax: axes handle + :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v. + :type fixed_inputs: a list of tuples + :param linecol: color of line to plot. + :type linecol: matplotlib color + :param fillcol: color of fill + :type fillcol: matplotlib color :param levels: for 2D plotting, the number of contour levels to use if ax is None, create a new figure + """ # TODO include samples if which_data == 'all': which_data = slice(None) if ax is None: fig = pb.figure(num=fignum) ax = fig.add_subplot(111) - if self.X.shape[1] == 1: + plotdims = self.input_dim - len(fixed_inputs) + + if plotdims == 1: Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now - Xnew, xmin, xmax = x_frame1D(Xu, plot_limits=plot_limits) - m, _, lower, upper = self.predict(Xnew, which_parts=which_parts) + fixed_dims = np.array([i for i,v in fixed_inputs]) + freedim = np.setdiff1d(np.arange(self.input_dim),fixed_dims) + + Xnew, xmin, xmax = x_frame1D(Xu[:,freedim], plot_limits=plot_limits) + Xgrid = np.empty((Xnew.shape[0],self.input_dim)) + Xgrid[:,freedim] = Xnew + for i,v in fixed_inputs: + Xgrid[:,i] = v + + m, _, lower, upper = self.predict(Xgrid, which_parts=which_parts) for d in range(m.shape[1]): - gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax) - ax.plot(Xu[which_data], self.likelihood.data[which_data, d], 'kx', mew=1.5) + gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], axes=ax, edgecol=linecol, fillcol=fillcol) + ax.plot(Xu[which_data,freedim], self.likelihood.data[which_data, d], 'kx', mew=1.5) ymin, ymax = min(np.append(self.likelihood.data, lower)), max(np.append(self.likelihood.data, upper)) ymin, ymax = ymin - 0.1 * (ymax - ymin), ymax + 0.1 * (ymax - ymin) ax.set_xlim(xmin, xmax) @@ -127,7 +200,7 @@ class GPBase(Model): m, _, lower, upper = self.predict(Xnew, which_parts=which_parts) m = m.reshape(resolution, resolution).T ax.contour(x, y, m, levels, vmin=m.min(), vmax=m.max(), cmap=pb.cm.jet) # @UndefinedVariable - Yf = self.likelihood.Y.flatten() + Yf = self.likelihood.data.flatten() ax.scatter(self.X[:, 0], self.X[:, 1], 40, Yf, cmap=pb.cm.jet, vmin=m.min(), vmax=m.max(), linewidth=0.) # @UndefinedVariable ax.set_xlim(xmin[0], xmax[0]) ax.set_ylim(xmin[1], xmax[1]) diff --git a/GPy/core/mapping.py b/GPy/core/mapping.py new file mode 100644 index 00000000..02b9664a --- /dev/null +++ b/GPy/core/mapping.py @@ -0,0 +1,190 @@ +# Copyright (c) 2013, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +from ..util.plot import Tango, x_frame1D, x_frame2D +from parameterized import Parameterized +import numpy as np +import pylab as pb + +class Mapping(Parameterized): + """ + Base model for shared behavior between models that can act like a mapping.
+ """ + + def __init__(self, input_dim, output_dim): + self.input_dim = input_dim + self.output_dim = output_dim + + super(Mapping, self).__init__() + # Model.__init__(self) + # All leaf nodes should call self._set_params(self._get_params()) at + # the end + + def f(self, X): + raise NotImplementedError + + def df_dX(self, dL_df, X): + """Evaluate derivatives of mapping outputs with respect to inputs. + + :param dL_df: gradient of the objective with respect to the function. + :type dL_df: ndarray (num_data x output_dim) + :param X: the input locations where derivatives are to be evaluated. + :type X: ndarray (num_data x input_dim) + :returns: matrix containing gradients of the function with respect to the inputs. + """ + raise NotImplementedError + + def df_dtheta(self, dL_df, X): + """The gradient of the outputs of the multi-layer perceptron with respect to each of the parameters. + + :param dL_df: gradient of the objective with respect to the function. + :type dL_df: ndarray (num_data x output_dim) + :param X: input locations where the function is evaluated. + :type X: ndarray (num_data x input_dim) + :returns: Matrix containing gradients with respect to parameters of each output for each input data. + :rtype: ndarray (num_params length) + """ + + raise NotImplementedError + + def plot(self, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, samples=0, fignum=None, ax=None, fixed_inputs=[], linecol=Tango.colorsHex['darkBlue']): + """ + Plot the mapping. + + Plots the mapping associated with the model. + - In one dimension, the function is plotted. + - In two dimsensions, a contour-plot shows the function + - In higher dimensions, we've not implemented this yet !TODO! + + Can plot only part of the data and part of the posterior functions + using which_data and which_functions + + :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits + :type plot_limits: np.array + :param which_data: which if the training data to plot (default all) + :type which_data: 'all' or a slice object to slice self.X, self.Y + :param which_parts: which of the kernel functions to plot (additively) + :type which_parts: 'all', or list of bools + :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D + :type resolution: int + :param levels: number of levels to plot in a contour plot. + :type levels: int + :param samples: the number of a posteriori samples to plot + :type samples: int + :param fignum: figure to plot on. + :type fignum: figure number + :param ax: axes to plot on. + :type ax: axes handle + :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v. + :type fixed_inputs: a list of tuples + :param linecol: color of line to plot. 
+ :type linecol: matplotlib color + :param levels: for 2D plotting, the number of contour levels to use + if ax is None, create a new figure + + """ + # TODO include samples + if which_data == 'all': + which_data = slice(None) + + if ax is None: + fig = pb.figure(num=fignum) + ax = fig.add_subplot(111) + + plotdims = self.input_dim - len(fixed_inputs) + + if plotdims == 1: + + Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now + + fixed_dims = np.array([i for i,v in fixed_inputs]) + freedim = np.setdiff1d(np.arange(self.input_dim),fixed_dims) + + Xnew, xmin, xmax = x_frame1D(Xu[:,freedim], plot_limits=plot_limits) + Xgrid = np.empty((Xnew.shape[0],self.input_dim)) + Xgrid[:,freedim] = Xnew + for i,v in fixed_inputs: + Xgrid[:,i] = v + + f = self.predict(Xgrid, which_parts=which_parts) + for d in range(f.shape[1]): + ax.plot(Xnew, f[:, d], color=linecol) + + elif self.X.shape[1] == 2: + resolution = resolution or 50 + Xnew, _, _, xmin, xmax = x_frame2D(self.X, plot_limits, resolution) + x, y = np.linspace(xmin[0], xmax[0], resolution), np.linspace(xmin[1], xmax[1], resolution) + f = self.predict(Xnew, which_parts=which_parts) + f = f.reshape(resolution, resolution).T + ax.contour(x, y, f, levels, vmin=f.min(), vmax=f.max(), cmap=pb.cm.jet) # @UndefinedVariable + ax.set_xlim(xmin[0], xmax[0]) + ax.set_ylim(xmin[1], xmax[1]) + + else: + raise NotImplementedError, "Cannot define a frame with more than two input dimensions" + +import GPy +from GPy.core.model import Model + +class Mapping_check_model(Model): + """This is a dummy model class used as a base class for checking that the gradients of a given mapping are implemented correctly. It enables checkgradient() to be called independently on each mapping.""" + def __init__(self, mapping=None, dL_df=None, X=None): + num_samples = 20 + if mapping is None: + mapping = GPy.mappings.Linear(1, 1) + if X is None: + X = np.random.randn(num_samples, mapping.input_dim) + if dL_df is None: + dL_df = np.ones((num_samples, mapping.output_dim)) + + self.mapping=mapping + self.X = X + self.dL_df = dL_df + self.num_params = self.mapping.num_params + Model.__init__(self) + + + def _get_params(self): + return self.mapping._get_params() + + def _get_param_names(self): + return self.mapping._get_param_names() + + def _set_params(self, x): + self.mapping._set_params(x) + + def log_likelihood(self): + return (self.dL_df*self.mapping.f(self.X)).sum() + + def _log_likelihood_gradients(self): + raise NotImplementedError, "This needs to be implemented to use the Mapping_check_model class." + +class Mapping_check_df_dtheta(Mapping_check_model): + """This class allows gradient checks for the gradient of a mapping with respect to parameters. """ + def __init__(self, mapping=None, dL_df=None, X=None): + Mapping_check_model.__init__(self,mapping=mapping,dL_df=dL_df, X=X) + + def _log_likelihood_gradients(self): + return self.mapping.df_dtheta(self.dL_df, self.X) + + +class Mapping_check_df_dX(Mapping_check_model): + """This class allows gradient checks for the gradient of a mapping with respect to X.
""" + def __init__(self, mapping=None, dL_df=None, X=None): + Mapping_check_model.__init__(self,mapping=mapping,dL_df=dL_df, X=X) + + if dL_df==None: + dL_df = np.ones((self.X.shape[0],self.mapping.output_dim)) + self.num_params = self.X.shape[0]*self.mapping.input_dim + + def _log_likelihood_gradients(self): + return self.mapping.df_dX(self.dL_df, self.X).flatten() + + def _get_param_names(self): + return ['X_' +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])] + + def _get_params(self): + return self.X.flatten() + + def _set_params(self, x): + self.X=x.reshape(self.X.shape) + diff --git a/GPy/core/model.py b/GPy/core/model.py index 05375b2a..2e70e7c2 100644 --- a/GPy/core/model.py +++ b/GPy/core/model.py @@ -6,49 +6,72 @@ from .. import likelihoods from ..inference import optimization from ..util.linalg import jitchol from GPy.util.misc import opt_wrapper -from parameterised import Parameterised +from parameterized import Parameterized import multiprocessing as mp import numpy as np from GPy.core.domains import POSITIVE, REAL from numpy.linalg.linalg import LinAlgError # import numdifftools as ndt -class Model(Parameterised): +class Model(Parameterized): _fail_count = 0 # Count of failed optimization steps (see objective) _allowed_failures = 10 # number of allowed failures def __init__(self): - Parameterised.__init__(self) + Parameterized.__init__(self) self.priors = None self.optimization_runs = [] self.sampling_runs = [] self.preferred_optimizer = 'scg' # self._set_params(self._get_params()) has been taken out as it should only be called on leaf nodes - def _get_params(self): - raise NotImplementedError, "this needs to be implemented to use the Model class" - def _set_params(self, x): - raise NotImplementedError, "this needs to be implemented to use the Model class" def log_likelihood(self): - raise NotImplementedError, "this needs to be implemented to use the Model class" + raise NotImplementedError, "this needs to be implemented to use the model class" def _log_likelihood_gradients(self): - raise NotImplementedError, "this needs to be implemented to use the Model class" + raise NotImplementedError, "this needs to be implemented to use the model class" + + def getstate(self): + """ + Get the current state of the class. + + Inherited from Parameterized, so add those parameters to the state + :return: list of states from the model. + + """ + return Parameterized.getstate(self) + \ + [self.priors, self.optimization_runs, + self.sampling_runs, self.preferred_optimizer] + + def setstate(self, state): + """ + set state from previous call to getstate + call Parameterized with the rest of the state + + :param state: the state of the model. + :type state: list as returned from getstate. + """ + self.preferred_optimizer = state.pop() + self.sampling_runs = state.pop() + self.optimization_runs = state.pop() + self.priors = state.pop() + Parameterized.setstate(self, state) def set_prior(self, regexp, what): """ - Sets priors on the Model parameters. - - Arguments - --------- - regexp -- string, regexp, or integer array - what -- instance of a Prior class + Sets priors on the model parameters. Notes ----- - Asserts that the Prior is suitable for the constraint. If the + Asserts that the prior is suitable for the constraint. If the wrong constraint is in place, an error is raised. If no constraint is in place, one is added (warning printed). 
- For tied parameters, the Prior will only be "counted" once, thus - a Prior object is only inserted on the first tied index + For tied parameters, the prior will only be "counted" once, thus + a prior object is only inserted on the first tied index + + :param regexp: regular expression of parameters on which priors need to be set. + :type regexp: string, regexp, or integer array + :param what: prior to set on parameter. + :type what: GPy.core.Prior type + + """ if self.priors is None: self.priors = [None for i in range(self._get_params().size)] @@ -58,12 +81,12 @@ class Model(Parameterised): # check tied situation tie_partial_matches = [tie for tie in self.tied_indices if (not set(tie).isdisjoint(set(which))) & (not set(tie) == set(which))] if len(tie_partial_matches): - raise ValueError, "cannot place Prior across partial ties" + raise ValueError, "cannot place prior across partial ties" tie_matches = [tie for tie in self.tied_indices if set(which) == set(tie) ] if len(tie_matches) > 1: - raise ValueError, "cannot place Prior across multiple ties" + raise ValueError, "cannot place prior across multiple ties" elif len(tie_matches) == 1: - which = which[:1] # just place a Prior object on the first parameter + which = which[:1] # just place a prior object on the first parameter # check constraints are okay @@ -75,7 +98,7 @@ class Model(Parameterised): else: constrained_positive_indices = np.zeros(shape=(0,)) bad_constraints = np.setdiff1d(self.all_constrained_indices(), constrained_positive_indices) - assert not np.any(which[:, None] == bad_constraints), "constraint and Prior incompatible" + assert not np.any(which[:, None] == bad_constraints), "constraint and prior incompatible" unconst = np.setdiff1d(which, constrained_positive_indices) if len(unconst): print "Warning: constraining parameters to be positive:" @@ -83,17 +106,22 @@ class Model(Parameterised): print '\n' self.constrain_positive(unconst) elif what.domain is REAL: - assert not np.any(which[:, None] == self.all_constrained_indices()), "constraint and Prior incompatible" + assert not np.any(which[:, None] == self.all_constrained_indices()), "constraint and prior incompatible" else: - raise ValueError, "Prior not recognised" - # store the Prior in a local list + raise ValueError, "prior not recognised" + + # store the prior in a local list for w in which: self.priors[w] = what def get_gradient(self, name, return_names=False): """ - Get Model gradient(s) by name. The name is applied as a regular expression and all parameters that match that regular expression are returned. + Get model gradient(s) by name. The name is applied as a regular expression and all parameters that match that regular expression are returned. + + :param name: the name of parameters required (as a regular expression). + :type name: regular expression + :param return_names: whether or not to return the names matched (default False) + :type return_names: bool """ matches = self.grep_param_names(name) if len(matches): @@ -133,14 +161,14 @@ class Model(Parameterised): def randomize(self): """ - Randomize the Model.
+ Make this draw from the prior if one exists, else draw from N(0,1) """ # first take care of all parameters (from N(0,1)) x = self._get_params_transformed() x = np.random.randn(x.size) self._set_params_transformed(x) - # now draw from Prior where possible + # now draw from prior where possible x = self._get_params() if self.priors is not None: [np.put(x, i, p.rvs(1)) for i, p in enumerate(self.priors) if not p is None] @@ -150,21 +178,30 @@ class Model(Parameterised): def optimize_restarts(self, num_restarts=10, robust=False, verbose=True, parallel=False, num_processes=None, **kwargs): """ - Perform random restarts of the Model, and set the Model to the best + Perform random restarts of the model, and set the model to the best seen solution. If the robust flag is set, exceptions raised during optimizations will - be handled silently. If _all_ runs fail, the Model is reset to the + be handled silently. If _all_ runs fail, the model is reset to the existing parameter values. Notes ----- + :param num_restarts: number of restarts to use (default 10) + :type num_restarts: int + :param robust: whether to handle exceptions silently or not (default False) + :type robust: bool + :param parallel: whether to run each restart as a separate process. It relies on the multiprocessing module. + :type parallel: bool + :param num_processes: number of workers in the multiprocessing pool + :type num_processes: int **kwargs are passed to the optimizer. They can be: - :max_f_eval: maximum number of function evaluations - :messages: whether to display during optimisation - :verbose: whether to show informations about the current restart - :parallel: whether to run each restart as a separate process. It relies on the multiprocessing module. - :num_processes: number of workers in the multiprocessing pool + :param max_f_eval: maximum number of function evaluations + :type max_f_eval: int + :param max_iters: maximum number of iterations + :type max_iters: int + :param messages: whether to display during optimisation + :type messages: bool ..Note: If num_processes is None, the number of workers in the multiprocessing pool is automatically set to the number of processors on the current machine. @@ -212,8 +249,13 @@ class Model(Parameterised): self._set_params_transformed(initial_parameters) def ensure_default_constraints(self): - """ - Ensure that any variables which should clearly be positive have been constrained somehow. + """ + Ensure that any variables which should clearly be positive + have been constrained somehow. The method performs a regular + expression search on parameter names looking for the terms + 'variance', 'lengthscale', 'precision' and 'kappa'. If any of + these terms are present in the name the parameter is + constrained positive. """ positive_strings = ['variance', 'lengthscale', 'precision', 'kappa'] # param_names = self._get_param_names() @@ -228,11 +270,15 @@ class Model(Parameterised): def objective_function(self, x): """ - The objective function passed to the optimizer. It combines the likelihood and the priors. + The objective function passed to the optimizer. It combines + the likelihood and the priors. Failures are handled robustly. The algorithm will try several times to return the objective, and will raise the original exception if it the objective cannot be computed. + + :param x: the parameters of the model. + :type x: np.array """ try: self._set_params_transformed(x) @@ -249,39 +295,53 @@ class Model(Parameterised): Gets the gradients from the likelihood and the priors.
Failures are handled robustly. The algorithm will try several times to - return the objective, and will raise the original exception if it + return the gradients, and will raise the original exception if it the objective cannot be computed. + + :param x: the parameters of the model. + :type x: np.array """ try: self._set_params_transformed(x) + obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()) self._fail_count = 0 except (LinAlgError, ZeroDivisionError, ValueError) as e: if self._fail_count >= self._allowed_failures: raise e self._fail_count += 1 - obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()) + obj_grads = np.clip(-self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()), -1e100, 1e100) return obj_grads def objective_and_gradients(self, x): + """ + Compute the objective function of the model and the gradient of the model at the point given by x. + + :param x: the point at which gradients are to be computed. + :type x: np.array + """ + try: self._set_params_transformed(x) obj_f = -self.log_likelihood() - self.log_prior() self._fail_count = 0 + obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()) except (LinAlgError, ZeroDivisionError, ValueError) as e: if self._fail_count >= self._allowed_failures: raise e self._fail_count += 1 obj_f = np.inf - obj_grads = -self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()) + obj_grads = np.clip(-self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()), -1e100, 1e100) return obj_f, obj_grads def optimize(self, optimizer=None, start=None, **kwargs): """ - Optimize the Model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors. + Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors. kwargs are passed to the optimizer. They can be: - :max_f_eval: maximum number of function evaluations + :param max_f_eval: maximum number of function evaluations + :type max_f_eval: int :messages: whether to display during optimisation + :type messages: bool :param optimzer: which optimizer to use (defaults to self.preferred optimizer) :type optimzer: string TODO: valid strings? """ @@ -293,7 +353,9 @@ class Model(Parameterised): optimizer = optimization.get_optimizer(optimizer) opt = optimizer(start, model=self, **kwargs) + opt.run(f_fp=self.objective_and_gradients, f=self.objective_function, fp=self.objective_function_gradients) + self.optimization_runs.append(opt) self._set_params_transformed(opt.x_opt) @@ -305,14 +367,14 @@ class Model(Parameterised): self.optimization_runs.append(sgd) def Laplace_covariance(self): - """return the covariance matric of a Laplace approximatino at the current (stationary) point""" - # TODO add in the Prior contributions for MAP estimation + """return the covariance matrix of a Laplace approximation at the current (stationary) point.""" + # TODO add in the prior contributions for MAP estimation # TODO fix the hessian for tied, constrained and fixed components if hasattr(self, 'log_likelihood_hessian'): A = -self.log_likelihood_hessian() else: - print "numerically calculating hessian. please be patient!" + print "numerically calculating Hessian. please be patient!"
x = self._get_params() def f(x): self._set_params(x) @@ -326,8 +388,8 @@ return A def Laplace_evidence(self): - """Returns an estiamte of the Model evidence based on the Laplace approximation. - Uses a numerical estimate of the hessian if none is available analytically""" + """Returns an estimate of the model evidence based on the Laplace approximation. + Uses a numerical estimate of the Hessian if none is available analytically.""" A = self.Laplace_covariance() try: hld = np.sum(np.log(np.diag(jitchol(A)[0]))) except: return np.nan return 0.5 * self._get_params().size * np.log(2 * np.pi) + self.log_likelihood() - hld - def __str__(self): - s = Parameterised.__str__(self).split('\n') + def __str__(self, names=None): + if names is None: + names = self._get_print_names() + s = Parameterized.__str__(self, names=names).split('\n') # add priors to the string if self.priors is not None: strs = [str(p) if p is not None else '' for p in self.priors] else: - strs = [''] * len(self._get_params()) + strs = [''] * len(self._get_param_names()) + name_indices = self.grep_param_names("|".join(names)) + strs = np.array(strs)[name_indices] width = np.array(max([len(p) for p in strs] + [5])) + 4 log_like = self.log_likelihood() log_prior = self.log_prior() obj_funct = '\nLog-likelihood: {0:.3e}'.format(log_like) if len(''.join(strs)) != 0: - obj_funct += ', Log Prior: {0:.3e}, LL+Prior = {0:.3e}'.format(log_prior, log_like + log_prior) + obj_funct += ', Log prior: {0:.3e}, LL+prior = {1:.3e}'.format(log_prior, log_like + log_prior) obj_funct += '\n\n' s[0] = obj_funct + s[0] - s[0] += "|{h:^{col}}".format(h='Prior', col=width) + s[0] += "|{h:^{col}}".format(h='prior', col=width) s[1] += '-' * (width + 1) for p in range(2, len(strs) + 2): - s[p] += '|{Prior:^{width}}'.format(Prior=strs[p - 2], width=width) + s[p] += '|{prior:^{width}}'.format(prior=strs[p - 2], width=width) return '\n'.join(s) def checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3): """ - Check the gradient of the Model by comparing to a numerical estimate. - If the verbose flag is passed, invividual components are tested (and printed) + Check the gradient of the model by comparing to a numerical + estimate. If the verbose flag is passed, individual + components are tested (and printed) :param verbose: If True, print a "full" checking of each parameter :type verbose: bool :param step: The size of the step around which to linearise the objective - :type step: float (defaul 1e-6) + :type step: float (default 1e-6) :param tolerance: the tolerance allowed (see note) :type tolerance: float (default 1e-3) @@ -391,7 +458,7 @@ class Model(Parameterised): numerical_gradient = (f1 - f2) / (2 * dx) global_ratio = (f1 - f2) / (2 * np.dot(dx, gradient)) - return (np.abs(1. - global_ratio) < tolerance) or (np.abs(gradient - numerical_gradient).mean() - 1) < tolerance + return (np.abs(1. - global_ratio) < tolerance) or (np.abs(gradient - numerical_gradient).mean() < tolerance) else: # check the gradient of each parameter individually, and do some pretty printing try: @@ -445,25 +512,27 @@ class Model(Parameterised): def input_sensitivity(self): """ - return an array describing the sesitivity of the Model to each input + return an array describing the sensitivity of the model to each input NB. Right now, we're basing this on the lengthscales (or variances) of the kernel.
TODO: proper sensitivity analysis - where we integrate across the Model inputs and evaluate the - effect on the variance of the Model output. """ + where we integrate across the model inputs and evaluate the + effect on the variance of the model output. """ if not hasattr(self, 'kern'): - raise ValueError, "this Model has no kernel" + raise ValueError, "this model has no kernel" - k = [p for p in self.kern.parts if p.name in ['rbf', 'linear']] + k = [p for p in self.kern.parts if p.name in ['rbf', 'linear', 'rbf_inv']] if (not len(k) == 1) or (not k[0].ARD): raise ValueError, "cannot determine sensitivity for this kernel" k = k[0] if k.name == 'rbf': - return k.lengthscale + return 1. / k.lengthscale + elif k.name == 'rbf_inv': + return k.inv_lengthscale elif k.name == 'linear': - return 1. / k.variances + return k.variances def pseudo_EM(self, epsilon=.1, **kwargs): diff --git a/GPy/core/parameterised.py b/GPy/core/parameterized.py similarity index 80% rename from GPy/core/parameterised.py rename to GPy/core/parameterized.py index b3a5712a..cad4d2a9 100644 --- a/GPy/core/parameterised.py +++ b/GPy/core/parameterized.py @@ -9,7 +9,7 @@ import cPickle import warnings import transformations -class Parameterised(object): +class Parameterized(object): def __init__(self): """ This is the base class for model and kernel. Mostly just handles tieing and constraining of parameters @@ -20,55 +20,66 @@ class Parameterised(object): self.constrained_indices = [] self.constraints = [] - def pickle(self, filename, protocol= -1): - f = file(filename, 'w') - cPickle.dump(self, f, protocol) - f.close() + def _get_params(self): + raise NotImplementedError, "this needs to be implemented to use the Parameterized class" + def _set_params(self, x): + raise NotImplementedError, "this needs to be implemented to use the Parameterized class" + + def _get_param_names(self): + raise NotImplementedError, "this needs to be implemented to use the Parameterized class" + def _get_print_names(self): + """ Override for which names to print out, when using print m """ + return self._get_param_names() + + def pickle(self, filename, protocol=None): + if protocol is None: + if self._has_get_set_state(): + protocol = 0 + else: + protocol = -1 + with open(filename, 'w') as f: + cPickle.dump(self, f, protocol) def copy(self): """Returns a (deep) copy of the current model """ return copy.deepcopy(self) - @property - def params(self): + def __getstate__(self): + if self._has_get_set_state(): + return self.getstate() + return self.__dict__ + + def __setstate__(self, state): + if self._has_get_set_state(): + self.setstate(state) # set state + self._set_params(self._get_params()) # restore all values + return + self.__dict__ = state + + def _has_get_set_state(self): + return 'getstate' in vars(self.__class__) and 'setstate' in vars(self.__class__) + + def getstate(self): """ - Returns a **copy** of parameters in non transformed space - - :see_also: :py:func:`GPy.core.Parameterised.params_transformed` + Get the current state of the class, + here just all the indices, rest can get recomputed + + For inheriting from Parameterized: + Always append the state of the inherited object + and call down to the inherited object in setstate!!
""" - return self._get_params() + return [self.tied_indices, + self.fixed_indices, + self.fixed_values, + self.constrained_indices, + self.constraints] - @params.setter - def params(self, params): - self._set_params(params) - - @property - def params_transformed(self): - """ - Returns a **copy** of parameters in transformed space - - :see_also: :py:func:`GPy.core.Parameterised.params` - """ - return self._get_params_transformed() - - @params_transformed.setter - def params_transformed(self, params): - self._set_params_transformed(params) - - _get_set_deprecation = """get and set methods wont be available at next minor release - in the next releases you will get and set with following syntax: - Assume m is a model class: - print m['var'] # > prints all parameters matching 'var' - m['var'] = 2. # > sets all parameters matching 'var' to 2. - m['var'] = # > sets parameters matching 'var' to - """ - def get(self, regexp): - warnings.warn(self._get_set_deprecation, FutureWarning, stacklevel=2) - return self[regexp] - - def set(self, regexp, val): - warnings.warn(self._get_set_deprecation, FutureWarning, stacklevel=2) - self[regexp] = val + def setstate(self, state): + self.constraints = state.pop() + self.constrained_indices = state.pop() + self.fixed_values = state.pop() + self.fixed_indices = state.pop() + self.tied_indices = state.pop() def __getitem__(self, regexp, return_names=False): """ @@ -95,13 +106,16 @@ class Parameterised(object): if len(matches): val = np.array(val) assert (val.size == 1) or val.size == len(matches), "Shape mismatch: {}:({},)".format(val.size, len(matches)) - x = self.params + x = self._get_params() x[matches] = val - self.params = x + self._set_params(x) else: raise AttributeError, "no parameter matches %s" % name def tie_params(self, regexp): + """ + Tie (all!) parameters matching the regular expression `regexp`. + """ matches = self.grep_param_names(regexp) assert matches.size > 0, "need at least something to tie together" if len(self.tied_indices): @@ -154,7 +168,7 @@ class Parameterised(object): return len(self._get_params()) - removed def unconstrain(self, regexp): - """Unconstrain matching parameters. does not untie parameters""" + """Unconstrain matching parameters. Does not untie parameters""" matches = self.grep_param_names(regexp) # tranformed contraints: @@ -181,7 +195,7 @@ class Parameterised(object): def constrain_negative(self, regexp): """ Set negative constraints. """ - self.constrain(regexp, transformations.negative_exponent()) + self.constrain(regexp, transformations.negative_logexp()) def constrain_positive(self, regexp): """ Set positive constraints. """ @@ -219,10 +233,11 @@ class Parameterised(object): """ Arguments --------- - :param regexp: np.array(dtype=int), or regular expression object or string - :param value: a float to fix the matched values to. If the value is not specified, + :param regexp: which parameters need to be fixed. + :type regexp: ndarray(dtype=int) or regular expression object or string + :param value: the vlaue to fix the parameters to. If the value is not specified, the parameter is fixed to the current value - + :type value: float Notes ----- Fixing a parameter which is tied to another, or constrained in some way will result in an error. 
@@ -321,19 +336,26 @@ class Parameterised(object): n = [nn for i, nn in enumerate(n) if not i in remove] return n - def __str__(self, nw=30): + @property + def all(self): + return self.__str__(self._get_param_names()) + + + def __str__(self, names=None, nw=30): """ Return a string describing the parameter names and their ties and constraints """ - names = self._get_param_names() + if names is None: + names = self._get_print_names() + name_indices = self.grep_param_names("|".join(names)) N = len(names) if not N: return "This object has no free parameters." header = ['Name', 'Value', 'Constraints', 'Ties'] - values = self._get_params() # map(str,self._get_params()) + values = self._get_params()[name_indices] # map(str,self._get_params()) # sort out the constraints - constraints = [''] * len(names) + constraints = [''] * len(self._get_param_names()) for i, t in zip(self.constrained_indices, self.constraints): for ii in i: constraints[ii] = t.__str__() diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index 04119071..40cfd404 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -33,10 +33,11 @@ class SparseGP(GPBase): self.Z = Z self.num_inducing = Z.shape[0] - self.likelihood = likelihood +# self.likelihood = likelihood if X_variance is None: self.has_uncertain_inputs = False + self.X_variance = None else: assert X_variance.shape == X.shape self.has_uncertain_inputs = True @@ -49,6 +50,25 @@ class SparseGP(GPBase): if self.has_uncertain_inputs: self.X_variance /= np.square(self._Xscale) + self._const_jitter = None + + def getstate(self): + """ + Get the current state of the class, + here just all the indices, rest can get recomputed + """ + return GPBase.getstate(self) + [self.Z, + self.num_inducing, + self.has_uncertain_inputs, + self.X_variance] + + def setstate(self, state): + self.X_variance = state.pop() + self.has_uncertain_inputs = state.pop() + self.num_inducing = state.pop() + self.Z = state.pop() + GPBase.setstate(self, state) + def _compute_kernel_matrices(self): # kernel computations, using BGPLVM notation self.Kmm = self.kern.K(self.Z) @@ -62,12 +82,14 @@ class SparseGP(GPBase): self.psi2 = None def _computations(self): - + if self._const_jitter is None or not(self._const_jitter.shape[0] == self.num_inducing): + self._const_jitter = np.eye(self.num_inducing) * 1e-7 # factor Kmm - self.Lm = jitchol(self.Kmm) + self._Lm = jitchol(self.Kmm + self._const_jitter) + # TODO: no white kernel needed anymore, all noise in likelihood -------- - # The rather complex computations of self.A + # The rather complex computations of self._A if self.has_uncertain_inputs: if self.likelihood.is_heteroscedastic: psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.num_data, 1, 1))).sum(0) @@ -75,29 +97,32 @@ class SparseGP(GPBase): psi2_beta = self.psi2.sum(0) * self.likelihood.precision evals, evecs = linalg.eigh(psi2_beta) clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable + if not np.array_equal(evals, clipped_evals): + pass # print evals tmp = evecs * np.sqrt(clipped_evals) tmp = tmp.T else: if self.likelihood.is_heteroscedastic: - tmp = self.psi1 * (np.sqrt(self.likelihood.precision.flatten().reshape(self.num_data,1))) + tmp = self.psi1 * (np.sqrt(self.likelihood.precision.flatten().reshape(self.num_data, 1))) else: tmp = self.psi1 * (np.sqrt(self.likelihood.precision)) - tmp, _ = dtrtrs(self.Lm, np.asfortranarray(tmp.T), lower=1) - self.A = tdot(tmp) + tmp, _ = dtrtrs(self._Lm, np.asfortranarray(tmp.T), lower=1) + self._A = 
tdot(tmp) # factor B - self.B = np.eye(self.num_inducing) + self.A + self.B = np.eye(self.num_inducing) + self._A self.LB = jitchol(self.B) - #VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency! + # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency! self.psi1Vf = np.dot(self.psi1.T, self.likelihood.VVT_factor) # back substutue C into psi1Vf - tmp, info1 = dtrtrs(self.Lm, np.asfortranarray(self.psi1Vf), lower=1, trans=0) + tmp, info1 = dtrtrs(self._Lm, np.asfortranarray(self.psi1Vf), lower=1, trans=0) self._LBi_Lmi_psi1Vf, _ = dtrtrs(self.LB, np.asfortranarray(tmp), lower=1, trans=0) - tmp, info2 = dpotrs(self.LB, tmp, lower=1) - self.Cpsi1Vf, info3 = dtrtrs(self.Lm, tmp, lower=1, trans=1) + # tmp, info2 = dpotrs(self.LB, tmp, lower=1) + tmp, info2 = dtrtrs(self.LB, self._LBi_Lmi_psi1Vf, lower=1, trans=1) + self.Cpsi1Vf, info3 = dtrtrs(self._Lm, tmp, lower=1, trans=1) # Compute dL_dKmm tmp = tdot(self._LBi_Lmi_psi1Vf) @@ -106,12 +131,12 @@ class SparseGP(GPBase): tmp = -0.5 * self.DBi_plus_BiPBi tmp += -0.5 * self.B * self.output_dim tmp += self.output_dim * np.eye(self.num_inducing) - self.dL_dKmm = backsub_both_sides(self.Lm, tmp) + self.dL_dKmm = backsub_both_sides(self._Lm, tmp) # Compute dL_dpsi # FIXME: this is untested for the heterscedastic + uncertain inputs case self.dL_dpsi0 = -0.5 * self.output_dim * (self.likelihood.precision * np.ones([self.num_data, 1])).flatten() self.dL_dpsi1 = np.dot(self.likelihood.VVT_factor, self.Cpsi1Vf.T) - dL_dpsi2_beta = 0.5 * backsub_both_sides(self.Lm, self.output_dim * np.eye(self.num_inducing) - self.DBi_plus_BiPBi) + dL_dpsi2_beta = 0.5 * backsub_both_sides(self._Lm, self.output_dim * np.eye(self.num_inducing) - self.DBi_plus_BiPBi) if self.likelihood.is_heteroscedastic: if self.has_uncertain_inputs: @@ -139,17 +164,17 @@ class SparseGP(GPBase): else: # likelihood is not heterscedatic self.partial_for_likelihood = -0.5 * self.num_data * self.output_dim * self.likelihood.precision + 0.5 * self.likelihood.trYYT * self.likelihood.precision ** 2 - self.partial_for_likelihood += 0.5 * self.output_dim * (self.psi0.sum() * self.likelihood.precision ** 2 - np.trace(self.A) * self.likelihood.precision) - self.partial_for_likelihood += self.likelihood.precision * (0.5 * np.sum(self.A * self.DBi_plus_BiPBi) - self.data_fit) + self.partial_for_likelihood += 0.5 * self.output_dim * (self.psi0.sum() * self.likelihood.precision ** 2 - np.trace(self._A) * self.likelihood.precision) + self.partial_for_likelihood += self.likelihood.precision * (0.5 * np.sum(self._A * self.DBi_plus_BiPBi) - self.data_fit) def log_likelihood(self): """ Compute the (lower bound on the) log marginal likelihood """ if self.likelihood.is_heteroscedastic: - A = -0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.likelihood.precision)) - 0.5 * np.sum(self.likelihood.V*self.likelihood.Y) - B = -0.5 * self.output_dim * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self.A)) + A = -0.5 * self.num_data * self.output_dim * np.log(2.*np.pi) + 0.5 * np.sum(np.log(self.likelihood.precision)) - 0.5 * np.sum(self.likelihood.V * self.likelihood.Y) + B = -0.5 * self.output_dim * (np.sum(self.likelihood.precision.flatten() * self.psi0) - np.trace(self._A)) else: A = -0.5 * self.num_data * self.output_dim * (np.log(2.*np.pi) - np.log(self.likelihood.precision)) - 0.5 * self.likelihood.precision * self.likelihood.trYYT - B = -0.5 * self.output_dim * (np.sum(self.likelihood.precision * 
self.psi0) - np.trace(self.A)) + B = -0.5 * self.output_dim * (np.sum(self.likelihood.precision * self.psi0) - np.trace(self._A)) C = -self.output_dim * (np.sum(np.log(np.diag(self.LB)))) # + 0.5 * self.num_inducing * np.log(sf2)) D = 0.5 * self.data_fit return A + B + C + D + self.likelihood.Z @@ -166,9 +191,12 @@ class SparseGP(GPBase): return np.hstack([self.Z.flatten(), self.kern._get_params_transformed(), self.likelihood._get_params()]) def _get_param_names(self): - return sum([['iip_%i_%i' % (i, j) for j in range(self.Z.shape[1])] for i in range(self.Z.shape[0])],[])\ + return sum([['iip_%i_%i' % (i, j) for j in range(self.Z.shape[1])] for i in range(self.Z.shape[0])], [])\ + self.kern._get_param_names_transformed() + self.likelihood._get_param_names() + def _get_print_names(self): + return self.kern._get_param_names_transformed() + self.likelihood._get_param_names() + def update_likelihood_approximation(self): """ Approximates a non-gaussian likelihood using Expectation Propagation @@ -179,7 +207,7 @@ class SparseGP(GPBase): if not isinstance(self.likelihood, Gaussian): # Updates not needed for Gaussian likelihood self.likelihood.restart() if self.has_uncertain_inputs: - Lmi = chol_inv(self.Lm) + Lmi = chol_inv(self._Lm) Kmmi = tdot(Lmi.T) diag_tr_psi2Kmmi = np.array([np.trace(psi2_Kmmi) for psi2_Kmmi in np.dot(self.psi2, Kmmi)]) @@ -221,19 +249,20 @@ class SparseGP(GPBase): return dL_dZ def _raw_predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False): - """Internal helper function for making predictions, does not account for normalization""" + """ + Internal helper function for making predictions, does not account for + normalization or likelihood function + """ - Bi, _ = dpotri(self.LB, lower=0) # WTH? this lower switch should be 1, but that doesn't work! + Bi, _ = dpotri(self.LB, lower=0) # WTH? this lower switch should be 1, but that doesn't work! 
symmetrify(Bi) - Kmmi_LmiBLmi = backsub_both_sides(self.Lm, np.eye(self.num_inducing) - Bi) + Kmmi_LmiBLmi = backsub_both_sides(self._Lm, np.eye(self.num_inducing) - Bi) if self.Cpsi1V is None: - psi1V = np.dot(self.psi1.T,self.likelihood.V) - tmp, _ = dtrtrs(self.Lm, np.asfortranarray(psi1V), lower=1, trans=0) + psi1V = np.dot(self.psi1.T, self.likelihood.V) + tmp, _ = dtrtrs(self._Lm, np.asfortranarray(psi1V), lower=1, trans=0) tmp, _ = dpotrs(self.LB, tmp, lower=1) - self.Cpsi1V, _ = dtrtrs(self.Lm, tmp, lower=1, trans=1) - - + self.Cpsi1V, _ = dtrtrs(self._Lm, tmp, lower=1, trans=1) if X_variance_new is None: Kx = self.kern.K(self.Z, Xnew, which_parts=which_parts) @@ -294,17 +323,19 @@ class SparseGP(GPBase): return mean, var, _025pm, _975pm def plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, fignum=None, ax=None): + if ax is None: fig = pb.figure(num=fignum) ax = fig.add_subplot(111) - if which_data is 'all': which_data = slice(None) GPBase.plot(self, samples=0, plot_limits=None, which_data='all', which_parts='all', resolution=None, levels=20, ax=ax) + + # add the inducing inputs and some errorbars if self.X.shape[1] == 1: if self.has_uncertain_inputs: - Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now + Xu = self.X * self._Xscale + self._Xoffset # NOTE self.X are the normalized values now ax.errorbar(Xu[which_data, 0], self.likelihood.data[which_data, 0], xerr=2 * np.sqrt(self.X_variance[which_data, 0]), ecolor='k', fmt=None, elinewidth=.5, alpha=.5) diff --git a/GPy/core/svigp.py b/GPy/core/svigp.py index 1db0e26f..b0175a39 100644 --- a/GPy/core/svigp.py +++ b/GPy/core/svigp.py @@ -91,6 +91,58 @@ class SVIGP(GPBase): self._param_steplength_trace = [] self._vb_steplength_trace = [] + def getstate(self): + steplength_params = [self.hbar_t, self.tau_t, self.gbar_t, self.gbar_t1, self.gbar_t2, self.hbar_tp, self.tau_tp, self.gbar_tp, self.adapt_param_steplength, self.adapt_vb_steplength, self.vb_steplength, self.param_steplength] + return GPBase.getstate(self) + \ + [self.get_vb_param(), + self.Z, + self.num_inducing, + self.has_uncertain_inputs, + self.X_variance, + self.X_batch, + self.X_variance_batch, + steplength_params, + self.batchcounter, + self.batchsize, + self.epochs, + self.momentum, + self.data_prop, + self._param_trace, + self._param_steplength_trace, + self._vb_steplength_trace, + self._ll_trace, + self._grad_trace, + self.Y, + self._permutation, + self.iterations + ] + + def setstate(self, state): + self.iterations = state.pop() + self._permutation = state.pop() + self.Y = state.pop() + self._grad_trace = state.pop() + self._ll_trace = state.pop() + self._vb_steplength_trace = state.pop() + self._param_steplength_trace = state.pop() + self._param_trace = state.pop() + self.data_prop = state.pop() + self.momentum = state.pop() + self.epochs = state.pop() + self.batchsize = state.pop() + self.batchcounter = state.pop() + steplength_params = state.pop() + (self.hbar_t, self.tau_t, self.gbar_t, self.gbar_t1, self.gbar_t2, self.hbar_tp, self.tau_tp, self.gbar_tp, self.adapt_param_steplength, self.adapt_vb_steplength, self.vb_steplength, self.param_steplength) = steplength_params + self.X_variance_batch = state.pop() + self.X_batch = state.pop() + self.X_variance = state.pop() + self.has_uncertain_inputs = state.pop() + self.num_inducing = state.pop() + self.Z = state.pop() + vb_param = state.pop() + GPBase.setstate(self, state) + self.set_vb_param(vb_param) + def 
_compute_kernel_matrices(self): # kernel computations, using BGPLVM notation self.Kmm = self.kern.K(self.Z) @@ -166,7 +218,7 @@ class SVIGP(GPBase): psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.batchsize, 1, 1))).sum(0) else: psi2_beta = self.psi2.sum(0) * self.likelihood.precision - evals, evecs = linalg.eigh(psi2_beta) + evals, evecs = np.linalg.eigh(psi2_beta) clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable tmp = evecs * np.sqrt(clipped_evals) else: @@ -449,7 +501,7 @@ class SVIGP(GPBase): ax.plot(Zu, np.zeros_like(Zu) + Z_height, 'r|', mew=1.5, markersize=12) if self.input_dim==2: - ax.scatter(self.X_all[:,0], self.X_all[:,1], 20., self.Y[:,0], linewidth=0, cmap=pb.cm.jet) + ax.scatter(self.X[:,0], self.X[:,1], 20., self.Y[:,0], linewidth=0, cmap=pb.cm.jet) ax.plot(Zu[:,0], Zu[:,1], 'w^') def plot_traces(self): diff --git a/GPy/core/transformations.py b/GPy/core/transformations.py index 2520a33b..419bc54e 100644 --- a/GPy/core/transformations.py +++ b/GPy/core/transformations.py @@ -4,6 +4,8 @@ import numpy as np from GPy.core.domains import POSITIVE, NEGATIVE, BOUNDED +import sys +lim_val = -np.log(sys.float_info.epsilon) class transformation(object): domain = None @@ -17,7 +19,7 @@ class transformation(object): """ df_dx evaluated at self.f(x)=f""" raise NotImplementedError def initialize(self, f): - """ produce a sensible initial values for f(x)""" + """ produce a sensible initial value for f(x)""" raise NotImplementedError def __str__(self): raise NotImplementedError @@ -25,18 +27,34 @@ class transformation(object): class logexp(transformation): domain = POSITIVE def f(self, x): - return np.log(1. + np.exp(x)) + return np.where(x>lim_val, x, np.log(1. + np.exp(x))) def finv(self, f): - return np.log(np.exp(f) - 1.) + return np.where(f>lim_val, f, np.log(np.exp(f) - 1.)) def gradfactor(self, f): - ef = np.exp(f) - return (ef - 1.) / ef + return np.where(f>lim_val, 1., 1 - np.exp(-f)) def initialize(self, f): + if np.any(f < 0.): + print "Warning: changing parameters to satisfy constraints" return np.abs(f) def __str__(self): return '(+ve)' -class logexp_clipped(transformation): +class negative_logexp(logexp): + domain = NEGATIVE + def f(self, x): + return -logexp.f(self, x) #np.log(1. + np.exp(x)) + def finv(self, f): + return logexp.finv(self, -f) #np.log(np.exp(-f) - 1.) + def gradfactor(self, f): + return -logexp.gradfactor(self, -f) + #ef = np.exp(-f) + #return -(ef - 1.) / ef + def initialize(self, f): + return -logexp.initialize(self, f) #np.abs(f) + def __str__(self): + return '(-ve)' + +class logexp_clipped(logexp): max_bound = 1e100 min_bound = 1e-10 log_max_bound = np.log(max_bound) @@ -64,9 +82,10 @@ return '(+ve_c)' class exponent(transformation): + # TODO: can't allow this to go to zero, need to set a lower bound. Similar with negative exponent below. See old MATLAB code.
domain = POSITIVE def f(self, x): - return np.exp(x) + return np.where(x < lim_val, np.where(x > -lim_val, np.exp(x), np.exp(-lim_val)), np.exp(lim_val)) def finv(self, x): return np.log(x) def gradfactor(self, f): @@ -78,18 +97,16 @@ class exponent(transformation): def __str__(self): return '(+ve)' -class negative_exponent(transformation): +class negative_exponent(exponent): domain = NEGATIVE def f(self, x): - return -np.exp(x) - def finv(self, x): - return np.log(-x) + return -exponent.f(self, x) + def finv(self, f): + return exponent.finv(self, -f) def gradfactor(self, f): return f def initialize(self, f): - if np.any(f > 0.): - print "Warning: changing parameters to satisfy constraints" - return -np.abs(f) + return -exponent.initialize(self, f) #np.abs(f) def __str__(self): return '(-ve)' diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py index c7daa26b..77d1982c 100644 --- a/GPy/examples/classification.py +++ b/GPy/examples/classification.py @@ -10,7 +10,7 @@ import numpy as np import GPy default_seed = 10000 -def crescent_data(seed=default_seed): # FIXME +def crescent_data(seed=default_seed, kernel=None): # FIXME """Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood. :param model_type: type of model to fit ['Full', 'FITC', 'DTC']. @@ -32,33 +32,33 @@ def crescent_data(seed=default_seed): # FIXME m.plot() return m -def oil(num_inducing=50): +def oil(num_inducing=50, max_iters=100, kernel=None): """ - Run a Gaussian process classification on the oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood. + Run a Gaussian process classification on the three phase oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood. """ data = GPy.util.datasets.oil() - X = data['X'][:600,:] - X_test = data['X'][600:,:] - Y = data['Y'][:600, 0:1] + X = data['X'] + Xtest = data['Xtest'] + Y = data['Y'][:, 0:1] + Ytest = data['Ytest'][:, 0:1] Y[Y.flatten()==-1] = 0 - Y_test = data['Y'][600:, 0:1] + Ytest[Ytest.flatten()==-1] = 0 # Create GP model - m = GPy.models.SparseGPClassification(X, Y,num_inducing=num_inducing) + m = GPy.models.SparseGPClassification(X, Y,kernel=kernel,num_inducing=num_inducing) # Constrain all parameters to be positive - m.constrain_positive('') m.tie_params('.*len') m['.*len'] = 10. m.update_likelihood_approximation() # Optimize - m.optimize() + m.optimize(max_iters=max_iters) print(m) #Test - probs = m.predict(X_test)[0] - GPy.util.classification.conf_matrix(probs,Y_test) + probs = m.predict(Xtest)[0] + GPy.util.classification.conf_matrix(probs,Ytest) return m def toy_linear_1d_classification(seed=default_seed): @@ -118,7 +118,7 @@ def sparse_toy_linear_1d_classification(num_inducing=10,seed=default_seed): return m -def sparse_crescent_data(num_inducing=10, seed=default_seed): +def sparse_crescent_data(num_inducing=10, seed=default_seed, kernel=None): """ Run a Gaussian process classification with DTC approximation on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood. @@ -133,7 +133,7 @@ def sparse_crescent_data(num_inducing=10, seed=default_seed): Y = data['Y'] Y[Y.flatten()==-1]=0 - m = GPy.models.SparseGPClassification(data['X'], Y,num_inducing=num_inducing) + m = GPy.models.SparseGPClassification(data['X'], Y, kernel=kernel, num_inducing=num_inducing) m['.*len'] = 10.
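For reference, the test step of the `oil` demo above boils down to a thresholded confusion matrix over the predictive probabilities. A plain-numpy sketch of that computation (the demo itself delegates to `GPy.util.classification.conf_matrix`; the 0.5 threshold and the toy arrays here are assumptions for illustration):

```python
import numpy as np

probs = np.array([[0.9], [0.2], [0.6], [0.4]])  # predictive p(y=1), as returned by m.predict
Ytest = np.array([[1], [0], [0], [1]])          # 0/1 labels, as in the demo

pred = (probs > 0.5).astype(int)                # hard decisions
confusion = np.zeros((2, 2), dtype=int)         # rows: true class, cols: predicted class
for t, p in zip(Ytest.ravel(), pred.ravel()):
    confusion[t, p] += 1
print(confusion)                                # [[1 1], [1 1]] for this toy input
```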
#m.update_likelihood_approximation() #m.optimize() diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index 8f6fdbe7..005b131f 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -7,26 +7,30 @@ from matplotlib import pyplot as plt, cm import GPy from GPy.core.transformations import logexp from GPy.models.bayesian_gplvm import BayesianGPLVM +from GPy.likelihoods.gaussian import Gaussian default_seed = np.random.seed(123344) def BGPLVM(seed=default_seed): - N = 10 - num_inducing = 3 - Q = 2 - D = 4 + N = 5 + num_inducing = 4 + Q = 3 + D = 2 # generate GPLVM-like data X = np.random.rand(N, Q) - k = GPy.kern.rbf(Q) + GPy.kern.white(Q, 0.00001) + lengthscales = np.random.rand(Q) + k = (GPy.kern.rbf(Q, .5, lengthscales, ARD=True) + + GPy.kern.white(Q, 0.01)) K = k.K(X) - Y = np.random.multivariate_normal(np.zeros(N), K, Q).T + Y = np.random.multivariate_normal(np.zeros(N), K, D).T + lik = Gaussian(Y, normalize=True) - k = GPy.kern.rbf(Q, ARD=True) + GPy.kern.linear(Q, ARD=True) + GPy.kern.rbf(Q, ARD=True) + GPy.kern.white(Q) - # k = GPy.kern.rbf(Q) + GPy.kern.rbf(Q) + GPy.kern.white(Q) + k = GPy.kern.rbf_inv(Q, .5, np.ones(Q) * 2., ARD=True) + GPy.kern.bias(Q) + GPy.kern.white(Q) # k = GPy.kern.rbf(Q) + GPy.kern.bias(Q) + GPy.kern.white(Q, 0.00001) # k = GPy.kern.rbf(Q, ARD = False) + GPy.kern.white(Q, 0.00001) - m = GPy.models.BayesianGPLVM(Y, Q, kernel=k, num_inducing=num_inducing) + m = GPy.models.BayesianGPLVM(lik, Q, kernel=k, num_inducing=num_inducing) + m.lengthscales = lengthscales # m.constrain_positive('(rbf|bias|noise|white|S)') # m.constrain_fixed('S', 1) @@ -37,8 +41,8 @@ def BGPLVM(seed=default_seed): # m.optimize(messages = 1) # m.plot() # pb.title('After optimisation') - m.randomize() - m.checkgrad(verbose=1) + # m.randomize() + # m.checkgrad(verbose=1) return m @@ -60,6 +64,28 @@ def GPLVM_oil_100(optimize=True): m.plot_latent(labels=m.data_labels) return m +def sparseGPLVM_oil(optimize=True, N=100, Q=6, num_inducing=15, max_iters=50): + np.random.seed(0) + data = GPy.util.datasets.oil() + + Y = data['X'][:N] + Y = Y - Y.mean(0) + Y /= Y.std(0) + + # create simple GP model + kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q) + m = GPy.models.SparseGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing) + m.data_labels = data['Y'].argmax(axis=1) + + # optimize + if optimize: + m.optimize('scg', messages=1, max_iters=max_iters) + + # plot + print(m) + # m.plot_latent(labels=m.data_labels) + return m + def swiss_roll(optimize=True, N=1000, num_inducing=15, Q=4, sigma=.2, plot=False): from GPy.util.datasets import swiss_roll_generated from GPy.core.transformations import logexp_clipped @@ -114,30 +140,33 @@ def swiss_roll(optimize=True, N=1000, num_inducing=15, Q=4, sigma=.2, plot=False m.optimize('scg', messages=1) return m -def BGPLVM_oil(optimize=True, N=200, Q=10, num_inducing=15, max_f_eval=50, plot=False, **k): +def BGPLVM_oil(optimize=True, N=200, Q=7, num_inducing=40, max_iters=1000, plot=False, **k): np.random.seed(0) data = GPy.util.datasets.oil() # create simple GP model - kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, np.exp(-2)) + GPy.kern.white(Q, np.exp(-2)) + kernel = GPy.kern.rbf_inv(Q, 1., [.1] * Q, ARD=True) + GPy.kern.bias(Q, np.exp(-2)) + Y = data['X'][:N] - Yn = Y - Y.mean(0) - Yn /= Yn.std(0) + Yn = Gaussian(Y, normalize=True) +# Yn = Y - Y.mean(0) +# Yn /= Yn.std(0) m = GPy.models.BayesianGPLVM(Yn, Q, kernel=kernel, num_inducing=num_inducing, **k) 
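The data-generation step at the top of `BGPLVM()` above follows the standard GPLVM sampling recipe: draw latent points, build a kernel matrix over them, and sample each output dimension from a zero-mean Gaussian with that covariance. A standalone numpy-only sketch of the same recipe (toy sizes; an isotropic unit lengthscale here for brevity, where the demo draws per-dimension lengthscales):

```python
import numpy as np

N, Q, D = 5, 3, 2                      # data points, latent dims, output dims
X = np.random.rand(N, Q)               # latent points

# rbf covariance with variance .5 plus a small white-noise jitter,
# mirroring the kernel the demo builds
sqdist = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)
K = .5 * np.exp(-.5 * sqdist) + .01 * np.eye(N)

# each output dimension is one draw from N(0, K)
Y = np.random.multivariate_normal(np.zeros(N), K, D).T   # shape (N, D)
```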
m.data_labels = data['Y'][:N].argmax(axis=1) # m.constrain('variance|leng', logexp_clipped()) - m['.*lengt'] = 1. # m.X.var(0).max() / m.X.var(0) - m['noise'] = Yn.var() / 100. + # m['.*lengt'] = m.X.var(0).max() / m.X.var(0) + m['noise'] = Yn.Y.var() / 100. # optimize if optimize: m.constrain_fixed('noise') - m.optimize('scg', messages=1, max_f_eval=100, gtol=.05) + m.optimize('scg', messages=1, max_iters=200, gtol=.05) m.constrain_positive('noise') - m.optimize('scg', messages=1, max_f_eval=max_f_eval, gtol=.05) + m.constrain_bounded('white', 1e-7, 1) + m.optimize('scg', messages=1, max_iters=max_iters, gtol=.05) if plot: y = m.likelihood.Y[0, :] @@ -186,7 +215,7 @@ def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False): Y1 += .3 * np.random.randn(*Y1.shape) Y2 += .2 * np.random.randn(*Y2.shape) - Y3 += .1 * np.random.randn(*Y3.shape) + Y3 += .25 * np.random.randn(*Y3.shape) Y1 -= Y1.mean(0) Y2 -= Y2.mean(0) @@ -241,29 +270,27 @@ def bgplvm_simulation_matlab_compare(): def bgplvm_simulation(optimize='scg', plot=True, - max_f_eval=2e4): + max_iters=2e4, + plot_sim=False): # from GPy.core.transformations import logexp_clipped - D1, D2, D3, N, num_inducing, Q = 15, 8, 8, 100, 3, 5 - slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot) + D1, D2, D3, N, num_inducing, Q = 15, 5, 8, 30, 3, 10 + slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim) from GPy.models import mrd from GPy import kern reload(mrd); reload(kern) - Y = Ylist[0] k = kern.linear(Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2)) # + kern.bias(Q) - m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k, _debug=True) + m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k) # m.constrain('variance|noise', logexp_clipped()) m['noise'] = Y.var() / 100. - m['linear_variance'] = .01 if optimize: print "Optimizing model:" - m.optimize(optimize, max_iters=max_f_eval, - max_f_eval=max_f_eval, + m.optimize(optimize, max_iters=max_iters, messages=True, gtol=.05) if plot: m.plot_X_1d("BGPLVM Latent Space 1D") @@ -271,19 +298,22 @@ def bgplvm_simulation(optimize='scg', return m def mrd_simulation(optimize=True, plot=True, plot_sim=True, **kw): - D1, D2, D3, N, num_inducing, Q = 150, 200, 400, 500, 3, 7 + D1, D2, D3, N, num_inducing, Q = 60, 20, 36, 60, 6, 5 slist, Slist, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim) + likelihood_list = [Gaussian(x, normalize=True) for x in Ylist] + from GPy.models import mrd from GPy import kern reload(mrd); reload(kern) - k = kern.linear(Q, [.05] * Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2)) - m = mrd.MRD(Ylist, input_dim=Q, num_inducing=num_inducing, kernels=k, initx="", initz='permute', **kw) + k = kern.linear(Q, ARD=True) + kern.bias(Q, np.exp(-2)) + kern.white(Q, np.exp(-2)) + m = mrd.MRD(likelihood_list, input_dim=Q, num_inducing=num_inducing, kernels=k, initx="", initz='permute', **kw) + m.ensure_default_constraints() - for i, Y in enumerate(Ylist): - m['{}_noise'.format(i + 1)] = Y.var() / 100. + for i, bgplvm in enumerate(m.bgplvms): + m['{}_noise'.format(i)] = bgplvm.likelihood.Y.var() / 500. 
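The per-view noise assignment just above encodes an initialisation heuristic used throughout these demos: start the Gaussian noise variance at a small fraction of the (normalized) data variance, so the kernel initially has to explain most of the signal. Sketch:

```python
import numpy as np

Y = np.random.randn(100, 5)        # stand-in for one (normalized) data view
noise_variance = Y.var() / 500.    # the MRD demo uses /500; the BGPLVM demos use /100
```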
# DEBUG @@ -291,7 +321,7 @@ def mrd_simulation(optimize=True, plot=True, plot_sim=True, **kw): if optimize: print "Optimizing Model:" - m.optimize(messages=1, max_iters=8e3, max_f_eval=8e3, gtol=.1) + m.optimize(messages=1, max_iters=8e3, gtol=.1) if plot: m.plot_X_1d("MRD Latent Space 1D") m.plot_scales("MRD Scales") @@ -322,9 +352,9 @@ def brendan_faces(): return m def stick_play(range=None, frame_rate=15): - data = GPy.util.datasets.stick() + data = GPy.util.datasets.osu_run1() # optimize - if range==None: + if range == None: Y = data['Y'].copy() else: Y = data['Y'][range[0]:range[1], :].copy() @@ -333,29 +363,73 @@ def stick_play(range=None, frame_rate=15): GPy.util.visualize.data_play(Y, data_show, frame_rate) return Y -def stick(): - data = GPy.util.datasets.stick() +def stick(kernel=None): + data = GPy.util.datasets.osu_run1() + # optimize + m = GPy.models.GPLVM(data['Y'], 2, kernel=kernel) + m.optimize(messages=1, max_f_eval=10000) + if GPy.util.visualize.visual_available: + plt.clf() + ax = m.plot_latent() + y = m.likelihood.Y[0, :] + data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect']) + lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) + raw_input('Press enter to finish') + + return m + +def bcgplvm_linear_stick(kernel=None): + data = GPy.util.datasets.osu_run1() + # optimize + mapping = GPy.mappings.Linear(data['Y'].shape[1], 2) + m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping) + m.optimize(messages=1, max_f_eval=10000) + if GPy.util.visualize.visual_available: + plt.clf() + ax = m.plot_latent() + y = m.likelihood.Y[0, :] + data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect']) + lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) + raw_input('Press enter to finish') + + return m + +def bcgplvm_stick(kernel=None): + data = GPy.util.datasets.osu_run1() + # optimize + back_kernel = GPy.kern.rbf(data['Y'].shape[1], lengthscale=5.)
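Before the kernel-based mapping is built just below, it may help to see what a back-constraint is in the linear case used by `bcgplvm_linear_stick` above: the latent points are not free parameters but a parametric function of the data, X = YW, and optimisation is over the mapping weights W. An illustrative sketch, not GPy's implementation:

```python
import numpy as np

Y = np.random.randn(100, 20)       # observed data, N x D
W = 0.1 * np.random.randn(20, 2)   # mapping parameters, D x Q (optimised in practice)
X = Y.dot(W)                       # back-constrained latent positions, N x Q
```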
+ mapping = GPy.mappings.Kernel(X=data['Y'], output_dim=2, kernel=back_kernel) + m = GPy.models.BCGPLVM(data['Y'], 2, kernel=kernel, mapping=mapping) + m.optimize(messages=1, max_f_eval=10000) + if GPy.util.visualize.visual_available: + plt.clf() + ax = m.plot_latent() + y = m.likelihood.Y[0, :] + data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect']) + lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) + raw_input('Press enter to finish') + + return m + +def robot_wireless(): + data = GPy.util.datasets.robot_wireless() # optimize m = GPy.models.GPLVM(data['Y'], 2) m.optimize(messages=1, max_f_eval=10000) m._set_params(m._get_params()) plt.clf() ax = m.plot_latent() - y = m.likelihood.Y[0, :] - data_show = GPy.util.visualize.stick_show(y[None, :], connect=data['connect']) - lvm_visualizer = GPy.util.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) - raw_input('Press enter to finish') return m def stick_bgplvm(model=None): - data = GPy.util.datasets.stick() + data = GPy.util.datasets.osu_run1() Q = 6 kernel = GPy.kern.rbf(Q, ARD=True) + GPy.kern.bias(Q, np.exp(-2)) + GPy.kern.white(Q, np.exp(-2)) - m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20,kernel=kernel) + m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20, kernel=kernel) # optimize m.ensure_default_constraints() - m.optimize(messages=1, max_f_eval=3000,xtol=1e-300,ftol=1e-300) + m.optimize('scg', messages=1, max_iters=200, xtol=1e-300, ftol=1e-300) m._set_params(m._get_params()) fig, (latent_axes, sense_axes) = plt.subplots(1, 2) plt.sca(latent_axes) diff --git a/GPy/examples/regression.py b/GPy/examples/regression.py index 21b435e7..c7849a46 100644 --- a/GPy/examples/regression.py +++ b/GPy/examples/regression.py @@ -9,181 +9,156 @@ import pylab as pb import numpy as np import GPy - -def toy_rbf_1d(optimizer='tnc', max_nb_eval_optim=100): - """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance.""" - data = GPy.util.datasets.toy_rbf_1d() - - # create simple GP Model - m = GPy.models.GPRegression(data['X'],data['Y']) - - # optimize - m.optimize(optimizer, max_f_eval=max_nb_eval_optim) - # plot - m.plot() - print(m) - return m - -def rogers_girolami_olympics(optim_iters=100): - """Run a standard Gaussian process regression on the Rogers and Girolami olympics data.""" - data = GPy.util.datasets.rogers_girolami_olympics() - - # create simple GP Model - m = GPy.models.GPRegression(data['X'],data['Y']) - - #set the lengthscale to be something sensible (defaults to 1) - m['rbf_lengthscale'] = 10 - - # optimize - m.optimize(max_f_eval=optim_iters) - - # plot - m.plot(plot_limits = (1850, 2050)) - print(m) - return m - -def toy_rbf_1d_50(optim_iters=100): - """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance.""" - data = GPy.util.datasets.toy_rbf_1d_50() - - # create simple GP Model - m = GPy.models.GPRegression(data['X'],data['Y']) - - # optimize - m.optimize(max_f_eval=optim_iters) - - # plot - m.plot() - print(m) - return m - -def silhouette(optim_iters=100): - """Predict the pose of a figure given a silhouette.
This is a task from Agarwal and Triggs 2004 ICML paper.""" - data = GPy.util.datasets.silhouette() - - # create simple GP Model - m = GPy.models.GPRegression(data['X'],data['Y']) - - # optimize - m.optimize(messages=True,max_f_eval=optim_iters) - - print(m) - return m - -def coregionalisation_toy2(optim_iters=100): +def coregionalisation_toy2(max_iters=100): """ A simple demonstration of coregionalisation on two sinusoidal functions. """ - X1 = np.random.rand(50,1)*8 - X2 = np.random.rand(30,1)*5 - index = np.vstack((np.zeros_like(X1),np.ones_like(X2))) - X = np.hstack((np.vstack((X1,X2)),index)) - Y1 = np.sin(X1) + np.random.randn(*X1.shape)*0.05 - Y2 = np.sin(X2) + np.random.randn(*X2.shape)*0.05 + 2. - Y = np.vstack((Y1,Y2)) + X1 = np.random.rand(50, 1) * 8 + X2 = np.random.rand(30, 1) * 5 + index = np.vstack((np.zeros_like(X1), np.ones_like(X2))) + X = np.hstack((np.vstack((X1, X2)), index)) + Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05 + Y2 = np.sin(X2) + np.random.randn(*X2.shape) * 0.05 + 2. + Y = np.vstack((Y1, Y2)) k1 = GPy.kern.rbf(1) + GPy.kern.bias(1) - k2 = GPy.kern.Coregionalise(2,1) - k = k1.prod(k2,tensor=True) - m = GPy.models.GPRegression(X,Y,kernel=k) - m.constrain_fixed('.*rbf_var',1.) - #m.constrain_positive('.*kappa') - m.optimize('sim',messages=1,max_f_eval=optim_iters) + k2 = GPy.kern.coregionalise(2, 1) + k = k1**k2 + m = GPy.models.GPRegression(X, Y, kernel=k) + m.constrain_fixed('.*rbf_var', 1.) + # m.constrain_positive('.*kappa') + m.optimize('sim', messages=1, max_iters=max_iters) pb.figure() - Xtest1 = np.hstack((np.linspace(0,9,100)[:,None],np.zeros((100,1)))) - Xtest2 = np.hstack((np.linspace(0,9,100)[:,None],np.ones((100,1)))) - mean, var,low,up = m.predict(Xtest1) - GPy.util.plot.gpplot(Xtest1[:,0],mean,low,up) - mean, var,low,up = m.predict(Xtest2) - GPy.util.plot.gpplot(Xtest2[:,0],mean,low,up) - pb.plot(X1[:,0],Y1[:,0],'rx',mew=2) - pb.plot(X2[:,0],Y2[:,0],'gx',mew=2) + Xtest1 = np.hstack((np.linspace(0, 9, 100)[:, None], np.zeros((100, 1)))) + Xtest2 = np.hstack((np.linspace(0, 9, 100)[:, None], np.ones((100, 1)))) + mean, var, low, up = m.predict(Xtest1) + GPy.util.plot.gpplot(Xtest1[:, 0], mean, low, up) + mean, var, low, up = m.predict(Xtest2) + GPy.util.plot.gpplot(Xtest2[:, 0], mean, low, up) + pb.plot(X1[:, 0], Y1[:, 0], 'rx', mew=2) + pb.plot(X2[:, 0], Y2[:, 0], 'gx', mew=2) return m -def coregionalisation_toy(optim_iters=100): +def coregionalisation_toy(max_iters=100): """ A simple demonstration of coregionalisation on two sinusoidal functions. """ - X1 = np.random.rand(50,1)*8 - X2 = np.random.rand(30,1)*5 - index = np.vstack((np.zeros_like(X1),np.ones_like(X2))) - X = np.hstack((np.vstack((X1,X2)),index)) - Y1 = np.sin(X1) + np.random.randn(*X1.shape)*0.05 - Y2 = -np.sin(X2) + np.random.randn(*X2.shape)*0.05 - Y = np.vstack((Y1,Y2)) + X1 = np.random.rand(50, 1) * 8 + X2 = np.random.rand(30, 1) * 5 + index = np.vstack((np.zeros_like(X1), np.ones_like(X2))) + X = np.hstack((np.vstack((X1, X2)), index)) + Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05 + Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05 + Y = np.vstack((Y1, Y2)) k1 = GPy.kern.rbf(1) - k2 = GPy.kern.Coregionalise(2,2) - k = k1.prod(k2,tensor=True) - m = GPy.models.GPRegression(X,Y,kernel=k) - m.constrain_fixed('.*rbf_var',1.) - #m.constrain_positive('kappa') - m.optimize(max_f_eval=optim_iters) + k2 = GPy.kern.coregionalise(2, 2) + k = k1**k2 #k1.prod(k2, tensor=True) + m = GPy.models.GPRegression(X, Y, kernel=k) + m.constrain_fixed('.*rbf_var', 1.) 
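The index-column trick these coregionalisation demos rely on, together with the coregionalisation matrix the `coregionalise` part builds (see its docstring later in this patch), in a standalone numpy sketch:

```python
import numpy as np

# Stack both outputs into one column and record which output each row
# belongs to in an extra input column, as the demos above do.
X1 = np.random.rand(50, 1) * 8
X2 = np.random.rand(30, 1) * 5
index = np.vstack((np.zeros_like(X1), np.ones_like(X2)))
X = np.hstack((np.vstack((X1, X2)), index))   # shape (80, 2): [input, output id]

# The coregionalisation matrix: B = W W^T + diag(kappa),
# here for 2 outputs with a rank-1 W.
W = np.random.randn(2, 1)
kappa = 0.5 * np.ones(2)
B = W.dot(W.T) + np.diag(kappa)               # 2 x 2, positive definite
```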
+ # m.constrain_positive('kappa') + m.optimize(max_iters=max_iters) pb.figure() - Xtest1 = np.hstack((np.linspace(0,9,100)[:,None],np.zeros((100,1)))) - Xtest2 = np.hstack((np.linspace(0,9,100)[:,None],np.ones((100,1)))) - mean, var,low,up = m.predict(Xtest1) - GPy.util.plot.gpplot(Xtest1[:,0],mean,low,up) - mean, var,low,up = m.predict(Xtest2) - GPy.util.plot.gpplot(Xtest2[:,0],mean,low,up) - pb.plot(X1[:,0],Y1[:,0],'rx',mew=2) - pb.plot(X2[:,0],Y2[:,0],'gx',mew=2) + Xtest1 = np.hstack((np.linspace(0, 9, 100)[:, None], np.zeros((100, 1)))) + Xtest2 = np.hstack((np.linspace(0, 9, 100)[:, None], np.ones((100, 1)))) + mean, var, low, up = m.predict(Xtest1) + GPy.util.plot.gpplot(Xtest1[:, 0], mean, low, up) + mean, var, low, up = m.predict(Xtest2) + GPy.util.plot.gpplot(Xtest2[:, 0], mean, low, up) + pb.plot(X1[:, 0], Y1[:, 0], 'rx', mew=2) + pb.plot(X2[:, 0], Y2[:, 0], 'gx', mew=2) return m -def coregionalisation_sparse(optim_iters=100): +def coregionalisation_sparse(max_iters=100): """ A simple demonstration of coregionalisation on two sinusoidal functions using sparse approximations. """ - X1 = np.random.rand(500,1)*8 - X2 = np.random.rand(300,1)*5 - index = np.vstack((np.zeros_like(X1),np.ones_like(X2))) - X = np.hstack((np.vstack((X1,X2)),index)) - Y1 = np.sin(X1) + np.random.randn(*X1.shape)*0.05 - Y2 = -np.sin(X2) + np.random.randn(*X2.shape)*0.05 - Y = np.vstack((Y1,Y2)) + X1 = np.random.rand(500, 1) * 8 + X2 = np.random.rand(300, 1) * 5 + index = np.vstack((np.zeros_like(X1), np.ones_like(X2))) + X = np.hstack((np.vstack((X1, X2)), index)) + Y1 = np.sin(X1) + np.random.randn(*X1.shape) * 0.05 + Y2 = -np.sin(X2) + np.random.randn(*X2.shape) * 0.05 + Y = np.vstack((Y1, Y2)) num_inducing = 40 - Z = np.hstack((np.random.rand(num_inducing,1)*8,np.random.randint(0,2,num_inducing)[:,None])) + Z = np.hstack((np.random.rand(num_inducing, 1) * 8, np.random.randint(0, 2, num_inducing)[:, None])) k1 = GPy.kern.rbf(1) - k2 = GPy.kern.Coregionalise(2,2) - k = k1.prod(k2,tensor=True) + GPy.kern.white(2,0.001) + k2 = GPy.kern.coregionalise(2, 2) + k = k1**k2 #.prod(k2, tensor=True) # + GPy.kern.white(2,0.001) - m = GPy.models.SparseGPRegression(X,Y,kernel=k,Z=Z) - m.constrain_fixed('.*rbf_var',1.) + m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z) + m.constrain_fixed('.*rbf_var', 1.) 
m.constrain_fixed('iip') - m.constrain_bounded('noise_variance',1e-3,1e-1) - m.optimize_restarts(5, robust=True, messages=1, max_f_eval=optim_iters) + m.constrain_bounded('noise_variance', 1e-3, 1e-1) +# m.optimize_restarts(5, robust=True, messages=1, max_iters=max_iters, optimizer='bfgs') + m.optimize(max_iters=max_iters) - #plotting: + # plotting: pb.figure() - Xtest1 = np.hstack((np.linspace(0,9,100)[:,None],np.zeros((100,1)))) - Xtest2 = np.hstack((np.linspace(0,9,100)[:,None],np.ones((100,1)))) - mean, var,low,up = m.predict(Xtest1) - GPy.util.plot.gpplot(Xtest1[:,0],mean,low,up) - mean, var,low,up = m.predict(Xtest2) - GPy.util.plot.gpplot(Xtest2[:,0],mean,low,up) - pb.plot(X1[:,0],Y1[:,0],'rx',mew=2) - pb.plot(X2[:,0],Y2[:,0],'gx',mew=2) + Xtest1 = np.hstack((np.linspace(0, 9, 100)[:, None], np.zeros((100, 1)))) + Xtest2 = np.hstack((np.linspace(0, 9, 100)[:, None], np.ones((100, 1)))) + mean, var, low, up = m.predict(Xtest1) + GPy.util.plot.gpplot(Xtest1[:, 0], mean, low, up) + mean, var, low, up = m.predict(Xtest2) + GPy.util.plot.gpplot(Xtest2[:, 0], mean, low, up) + pb.plot(X1[:, 0], Y1[:, 0], 'rx', mew=2) + pb.plot(X2[:, 0], Y2[:, 0], 'gx', mew=2) y = pb.ylim()[0] - pb.plot(Z[:,0][Z[:,1]==0],np.zeros(np.sum(Z[:,1]==0))+y,'r|',mew=2) - pb.plot(Z[:,0][Z[:,1]==1],np.zeros(np.sum(Z[:,1]==1))+y,'g|',mew=2) + pb.plot(Z[:, 0][Z[:, 1] == 0], np.zeros(np.sum(Z[:, 1] == 0)) + y, 'r|', mew=2) + pb.plot(Z[:, 0][Z[:, 1] == 1], np.zeros(np.sum(Z[:, 1] == 1)) + y, 'g|', mew=2) + return m + +def epomeo_gpx(max_iters=100): + """Perform Gaussian process regression on the latitude and longitude data from the Mount Epomeo runs. Requires gpxpy to be installed on your system to load in the data.""" + data = GPy.util.datasets.epomeo_gpx() + num_data_list = [] + for Xpart in data['X']: + num_data_list.append(Xpart.shape[0]) + + num_data_array = np.array(num_data_list) + num_data = num_data_array.sum() + Y = np.zeros((num_data, 2)) + t = np.zeros((num_data, 2)) + start = 0 + for Xpart, index in zip(data['X'], range(len(data['X']))): + end = start + Xpart.shape[0] + t[start:end, :] = np.hstack((Xpart[:, 0:1], + index*np.ones((Xpart.shape[0], 1)))) + Y[start:end, :] = Xpart[:, 1:3] + start = end + + num_inducing = 200 + Z = np.hstack((np.linspace(t[:,0].min(), t[:, 0].max(), num_inducing)[:, None], + np.random.randint(0, 4, num_inducing)[:, None])) + + k1 = GPy.kern.rbf(1) + k2 = GPy.kern.coregionalise(output_dim=5, rank=5) + k = k1**k2 + + m = GPy.models.SparseGPRegression(t, Y, kernel=k, Z=Z, normalize_Y=True) + m.constrain_fixed('.*rbf_var', 1.) + m.constrain_fixed('iip') + m.constrain_bounded('noise_variance', 1e-3, 1e-1) +# m.optimize_restarts(5, robust=True, messages=1, max_iters=max_iters, optimizer='bfgs') + m.optimize(max_iters=max_iters, messages=True) + return m -def multiple_optima(gene_number=937,resolution=80, model_restarts=10, seed=10000, optim_iters=300): - """Show an example of a multimodal error surface for Gaussian process regression. Gene 939 has bimodal behaviour where the noisey mode is higher.""" +def multiple_optima(gene_number=937, resolution=80, model_restarts=10, seed=10000, max_iters=300): - """Show an example of a multimodal error surface for Gaussian process regression. Gene 939 has bimodal behaviour where the noisy mode is higher.""" # Contour over a range of length scales and signal/noise ratios.
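The contour grid set up below is parameterised by log10 signal-to-noise ratio; for a fixed total variance, the split computed in `_contour_data` (further down) works out as in this small worked example:

```python
import numpy as np

total_var = 2.0
log_SNR = 1.0                          # i.e. SNR = 10
SNR = 10. ** log_SNR
noise_var = total_var / (1. + SNR)     # 2/11 ~ 0.1818
signal_var = total_var - noise_var     # ~ 1.8182
print(signal_var / noise_var)          # 10.0: the requested SNR, by construction
```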
length_scales = np.linspace(0.1, 60., resolution) log_SNRs = np.linspace(-3., 4., resolution) data = GPy.util.datasets.della_gatta_TRP63_gene_expression(gene_number) - #data['Y'] = data['Y'][0::2, :] - #data['X'] = data['X'][0::2, :] + # data['Y'] = data['Y'][0::2, :] + # data['X'] = data['X'][0::2, :] data['Y'] = data['Y'] - np.mean(data['Y']) @@ -202,26 +177,26 @@ def multiple_optima(gene_number=937,resolution=80, model_restarts=10, seed=10000 optim_point_y = np.empty(2) np.random.seed(seed=seed) for i in range(0, model_restarts): - #kern = GPy.kern.rbf(1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.)) - kern = GPy.kern.rbf(1, variance=np.random.uniform(1e-3,1), lengthscale=np.random.uniform(5,50)) + # kern = GPy.kern.rbf(1, variance=np.random.exponential(1.), lengthscale=np.random.exponential(50.)) + kern = GPy.kern.rbf(1, variance=np.random.uniform(1e-3, 1), lengthscale=np.random.uniform(5, 50)) - m = GPy.models.GPRegression(data['X'],data['Y'], kernel=kern) - m['noise_variance'] = np.random.uniform(1e-3,1) + m = GPy.models.GPRegression(data['X'], data['Y'], kernel=kern) + m['noise_variance'] = np.random.uniform(1e-3, 1) optim_point_x[0] = m['rbf_lengthscale'] optim_point_y[0] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']); # optimize - m.optimize('scg', xtol=1e-6, ftol=1e-6, max_f_eval=optim_iters) + m.optimize('scg', xtol=1e-6, ftol=1e-6, max_iters=max_iters) optim_point_x[1] = m['rbf_lengthscale'] optim_point_y[1] = np.log10(m['rbf_variance']) - np.log10(m['noise_variance']); - pb.arrow(optim_point_x[0], optim_point_y[0], optim_point_x[1]-optim_point_x[0], optim_point_y[1]-optim_point_y[0], label=str(i), head_length=1, head_width=0.5, fc='k', ec='k') + pb.arrow(optim_point_x[0], optim_point_y[0], optim_point_x[1] - optim_point_x[0], optim_point_y[1] - optim_point_y[0], label=str(i), head_length=1, head_width=0.5, fc='k', ec='k') models.append(m) ax.set_xlim(xlim) ax.set_ylim(ylim) - return m #(models, lls) + return m # (models, lls) def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf): """Evaluate the GP objective function for a given data set for a range of signal to noise ratios and a range of lengthscales. @@ -234,88 +209,261 @@ def _contour_data(data, length_scales, log_SNRs, kernel_call=GPy.kern.rbf): lls = [] total_var = np.var(data['Y']) kernel = kernel_call(1, variance=1., lengthscale=1.) - Model = GPy.models.GPRegression(data['X'], data['Y'], kernel=kernel) + model = GPy.models.GPRegression(data['X'], data['Y'], kernel=kernel) for log_SNR in log_SNRs: SNR = 10.**log_SNR - noise_var = total_var/(1.+SNR) + noise_var = total_var / (1. 
+ SNR) signal_var = total_var - noise_var - Model.kern['.*variance'] = signal_var - Model['noise_variance'] = noise_var + model.kern['.*variance'] = signal_var + model['noise_variance'] = noise_var length_scale_lls = [] for length_scale in length_scales: - Model['.*lengthscale'] = length_scale - length_scale_lls.append(Model.log_likelihood()) + model['.*lengthscale'] = length_scale + length_scale_lls.append(model.log_likelihood()) lls.append(length_scale_lls) return np.array(lls) -def sparse_GP_regression_1D(N = 400, num_inducing = 5, optim_iters=100): - """Run a 1D example of a sparse GP regression.""" - # sample inputs and outputs - X = np.random.uniform(-3.,3.,(N,1)) - Y = np.sin(X)+np.random.randn(N,1)*0.05 - # construct kernel - rbf = GPy.kern.rbf(1) - noise = GPy.kern.white(1) - kernel = rbf + noise + +def olympic_100m_men(max_iters=100, kernel=None): + """Run a standard Gaussian process regression on the Rogers and Girolami olympics data.""" + data = GPy.util.datasets.olympic_100m_men() + # create simple GP Model - m = GPy.models.SparseGPRegression(X, Y, kernel, num_inducing=num_inducing) + m = GPy.models.GPRegression(data['X'], data['Y'], kernel) + # set the lengthscale to be something sensible (defaults to 1) + if kernel==None: + m['rbf_lengthscale'] = 10 - m.checkgrad(verbose=1) - m.optimize('tnc', messages = 1, max_f_eval=optim_iters) - m.plot() + # optimize + m.optimize(max_iters=max_iters) + + # plot + m.plot(plot_limits=(1850, 2050)) + print(m) return m -def sparse_GP_regression_2D(N = 400, num_inducing = 50, optim_iters=100): - """Run a 2D example of a sparse GP regression.""" - X = np.random.uniform(-3.,3.,(N,2)) - Y = np.sin(X[:,0:1]) * np.sin(X[:,1:2])+np.random.randn(N,1)*0.05 - - # construct kernel - rbf = GPy.kern.rbf(2) - noise = GPy.kern.white(2) - kernel = rbf + noise +def olympic_marathon_men(max_iters=100, kernel=None): + """Run a standard Gaussian process regression on the Olympic marathon data.""" + data = GPy.util.datasets.olympic_marathon_men() # create simple GP Model - m = GPy.models.SparseGPRegression(X,Y,kernel, num_inducing = num_inducing) + m = GPy.models.GPRegression(data['X'], data['Y'], kernel) - # contrain all parameters to be positive (but not inducing inputs) - m.set('.*len',2.) 
+ # set the lengthscale to be something sensible (defaults to 1) + if kernel==None: + m['rbf_lengthscale'] = 10 - m.checkgrad() + # optimize + m.optimize(max_iters=max_iters) - # optimize and plot - m.optimize('tnc', messages = 1, max_f_eval=optim_iters) + # plot + m.plot(plot_limits=(1850, 2050)) + print(m) + return m + +def toy_rbf_1d(optimizer='tnc', max_nb_eval_optim=100): + """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance.""" + data = GPy.util.datasets.toy_rbf_1d() + + # create simple GP Model + m = GPy.models.GPRegression(data['X'], data['Y']) + + # optimize + m.optimize(optimizer, max_f_eval=max_nb_eval_optim) + # plot m.plot() print(m) return m -def uncertain_inputs_sparse_regression(optim_iters=100): +def toy_rbf_1d_50(max_iters=100): + """Run a simple demonstration of a standard Gaussian process fitting it to data sampled from an RBF covariance.""" + data = GPy.util.datasets.toy_rbf_1d_50() + + # create simple GP Model + m = GPy.models.GPRegression(data['X'], data['Y']) + + # optimize + m.optimize(max_iters=max_iters) + + # plot + m.plot() + print(m) + return m + +def toy_ARD(max_iters=1000, kernel_type='linear', num_samples=300, D=4): + # Create an artificial dataset where the values in the targets (Y) + # only depend in dimensions 1 and 3 of the inputs (X). Run ARD to + # see if this dependency can be recovered + X1 = np.sin(np.sort(np.random.rand(num_samples, 1) * 10, 0)) + X2 = np.cos(np.sort(np.random.rand(num_samples, 1) * 10, 0)) + X3 = np.exp(np.sort(np.random.rand(num_samples, 1), 0)) + X4 = np.log(np.sort(np.random.rand(num_samples, 1), 0)) + X = np.hstack((X1, X2, X3, X4)) + + Y1 = np.asarray(2 * X[:, 0] + 3).reshape(-1, 1) + Y2 = np.asarray(4 * (X[:, 2] - 1.5 * X[:, 0])).reshape(-1, 1) + Y = np.hstack((Y1, Y2)) + + Y = np.dot(Y, np.random.rand(2, D)); + Y = Y + 0.2 * np.random.randn(Y.shape[0], Y.shape[1]) + Y -= Y.mean() + Y /= Y.std() + + if kernel_type == 'linear': + kernel = GPy.kern.linear(X.shape[1], ARD=1) + elif kernel_type == 'rbf_inv': + kernel = GPy.kern.rbf_inv(X.shape[1], ARD=1) + else: + kernel = GPy.kern.rbf(X.shape[1], ARD=1) + kernel += GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1]) + m = GPy.models.GPRegression(X, Y, kernel) + # len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25 + # m.set_prior('.*lengthscale',len_prior) + + m.optimize(optimizer='scg', max_iters=max_iters, messages=1) + + m.kern.plot_ARD() + print(m) + return m + +def toy_ARD_sparse(max_iters=1000, kernel_type='linear', num_samples=300, D=4): + # Create an artificial dataset where the values in the targets (Y) + # only depend in dimensions 1 and 3 of the inputs (X). 
Run ARD to + # see if this dependency can be recovered + X1 = np.sin(np.sort(np.random.rand(num_samples, 1) * 10, 0)) + X2 = np.cos(np.sort(np.random.rand(num_samples, 1) * 10, 0)) + X3 = np.exp(np.sort(np.random.rand(num_samples, 1), 0)) + X4 = np.log(np.sort(np.random.rand(num_samples, 1), 0)) + X = np.hstack((X1, X2, X3, X4)) + + Y1 = np.asarray(2 * X[:, 0] + 3)[:, None] + Y2 = np.asarray(4 * (X[:, 2] - 1.5 * X[:, 0]))[:, None] + Y = np.hstack((Y1, Y2)) + + Y = np.dot(Y, np.random.rand(2, D)); + Y = Y + 0.2 * np.random.randn(Y.shape[0], Y.shape[1]) + Y -= Y.mean() + Y /= Y.std() + + if kernel_type == 'linear': + kernel = GPy.kern.linear(X.shape[1], ARD=1) + elif kernel_type == 'rbf_inv': + kernel = GPy.kern.rbf_inv(X.shape[1], ARD=1) + else: + kernel = GPy.kern.rbf(X.shape[1], ARD=1) + kernel += GPy.kern.bias(X.shape[1]) + X_variance = np.ones(X.shape) * 0.5 + m = GPy.models.SparseGPRegression(X, Y, kernel, X_variance=X_variance) + # len_prior = GPy.priors.inverse_gamma(1,18) # 1, 25 + # m.set_prior('.*lengthscale',len_prior) + + m.optimize(optimizer='scg', max_iters=max_iters, messages=1) + + m.kern.plot_ARD() + print(m) + return m + +def robot_wireless(max_iters=100, kernel=None): + """Predict the location of a robot given wireless signal strength readings.""" + data = GPy.util.datasets.robot_wireless() + + # create simple GP Model + m = GPy.models.GPRegression(data['Y'], data['X'], kernel=kernel) + + # optimize + m.optimize(messages=True, max_iters=max_iters) + Xpredict = m.predict(data['Ytest'])[0] + pb.plot(data['Xtest'][:, 0], data['Xtest'][:, 1], 'r-') + pb.plot(Xpredict[:, 0], Xpredict[:, 1], 'b-') + pb.axis('equal') + pb.title('WiFi Localization with Gaussian Processes') + pb.legend(('True Location', 'Predicted Location')) + + sse = ((data['Xtest'] - Xpredict)**2).sum() + print(m) + print('Sum of squares error on test data: ' + str(sse)) + return m + +def silhouette(max_iters=100): + """Predict the pose of a figure given a silhouette. This is a task from the Agarwal and Triggs 2004 ICML paper.""" + data = GPy.util.datasets.silhouette() + + # create simple GP Model + m = GPy.models.GPRegression(data['X'], data['Y']) + + # optimize + m.optimize(messages=True, max_iters=max_iters) + + print(m) + return m + + + +def sparse_GP_regression_1D(num_samples=400, num_inducing=5, max_iters=100): + """Run a 1D example of a sparse GP regression.""" + # sample inputs and outputs + X = np.random.uniform(-3., 3., (num_samples, 1)) + Y = np.sin(X) + np.random.randn(num_samples, 1) * 0.05 + # construct kernel + rbf = GPy.kern.rbf(1) + # create simple GP Model + m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing) + + + m.checkgrad(verbose=1) + m.optimize('tnc', messages=1, max_iters=max_iters) + m.plot() + return m + +def sparse_GP_regression_2D(num_samples=400, num_inducing=50, max_iters=100): + """Run a 2D example of a sparse GP regression.""" + X = np.random.uniform(-3., 3., (num_samples, 2)) + Y = np.sin(X[:, 0:1]) * np.sin(X[:, 1:2]) + np.random.randn(num_samples, 1) * 0.05 + + # construct kernel + rbf = GPy.kern.rbf(2) + + # create simple GP Model + m = GPy.models.SparseGPRegression(X, Y, kernel=rbf, num_inducing=num_inducing) + + # constrain all parameters to be positive (but not inducing inputs) + m['.*len'] = 2.
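How the ARD demos above are meant to be read, given the `plot_ARD` convention later in this patch: relevance is the inverse lengthscale for rbf kernels and the per-dimension variance for linear ones, so large values flag the input dimensions the targets actually depend on. The fitted values below are hypothetical, chosen to mimic the toy construction where only input dimensions 1 and 3 drive the targets:

```python
import numpy as np

lengthscales = np.array([0.8, 25., 1.1, 30.])  # hypothetical fitted rbf lengthscales
relevance = 1. / lengthscales                  # large value = relevant dimension
print(relevance.argsort()[::-1])               # [0 2 1 3]: dimensions 1 and 3 dominate
```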
+ + m.checkgrad() + + # optimize and plot + m.optimize('tnc', messages=1, max_iters=max_iters) + m.plot() + print(m) + return m + +def uncertain_inputs_sparse_regression(max_iters=100): """Run a 1D example of a sparse GP regression with uncertain inputs.""" - fig, axes = pb.subplots(1,2,figsize=(12,5)) + fig, axes = pb.subplots(1, 2, figsize=(12, 5)) # sample inputs and outputs - S = np.ones((20,1)) - X = np.random.uniform(-3.,3.,(20,1)) - Y = np.sin(X)+np.random.randn(20,1)*0.05 - #likelihood = GPy.likelihoods.Gaussian(Y) - Z = np.random.uniform(-3.,3.,(7,1)) + S = np.ones((20, 1)) + X = np.random.uniform(-3., 3., (20, 1)) + Y = np.sin(X) + np.random.randn(20, 1) * 0.05 + # likelihood = GPy.likelihoods.Gaussian(Y) + Z = np.random.uniform(-3., 3., (7, 1)) - k = GPy.kern.rbf(1) + GPy.kern.white(1) + k = GPy.kern.rbf(1) # create simple GP Model - no input uncertainty on this one m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z) - m.optimize('scg', messages=1, max_f_eval=optim_iters) + m.optimize('scg', messages=1, max_iters=max_iters) m.plot(ax=axes[0]) axes[0].set_title('no input uncertainty') - #the same Model with uncertainty + # the same Model with uncertainty m = GPy.models.SparseGPRegression(X, Y, kernel=k, Z=Z, X_variance=S) - m.optimize('scg', messages=1, max_f_eval=optim_iters) + m.optimize('scg', messages=1, max_iters=max_iters) m.plot(ax=axes[1]) axes[1].set_title('with input uncertainty') print(m) diff --git a/GPy/examples/stochastic.py b/GPy/examples/stochastic.py index 533904d5..21011901 100644 --- a/GPy/examples/stochastic.py +++ b/GPy/examples/stochastic.py @@ -16,6 +16,7 @@ def toy_1d(): m = GPy.models.SVIGPRegression(X,Y, batchsize=10, Z=Z) m.constrain_bounded('noise_variance',1e-3,1e-1) + m.constrain_bounded('white_variance',1e-3,1e-1) m.param_steplength = 1e-4 diff --git a/GPy/inference/optimization.py b/GPy/inference/optimization.py index 433d5f41..589ec4c7 100644 --- a/GPy/inference/optimization.py +++ b/GPy/inference/optimization.py @@ -4,6 +4,7 @@ import pylab as pb import datetime as dt from scipy import optimize +from warnings import warn try: import rasmussens_minimize as rasm @@ -129,7 +130,7 @@ class opt_lbfgsb(Optimizer): opt_dict['pgtol'] = self.gtol opt_result = optimize.fmin_l_bfgs_b(f_fp, self.x_init, iprint=iprint, - maxfun=self.max_f_eval, **opt_dict) + maxfun=self.max_iters, **opt_dict) self.x_opt = opt_result[0] self.f_opt = f_fp(self.x_opt)[0] self.funct_eval = opt_result[2]['funcalls'] @@ -198,17 +199,22 @@ class opt_rasm(Optimizer): class opt_SCG(Optimizer): def __init__(self, *args, **kwargs): + if 'max_f_eval' in kwargs: + warn("max_f_eval deprecated for SCG optimizer: use max_iters instead!\nIgnoring max_f_eval!", FutureWarning) Optimizer.__init__(self, *args, **kwargs) + self.opt_name = "Scaled Conjugate Gradients" def opt(self, f_fp=None, f=None, fp=None): assert not f is None assert not fp is None + opt_result = SCG(f, fp, self.x_init, display=self.messages, maxiters=self.max_iters, max_f_eval=self.max_f_eval, xtol=self.xtol, ftol=self.ftol, gtol=self.gtol) + self.x_opt = opt_result[0] self.trace = opt_result[1] self.f_opt = self.trace[-1] diff --git a/GPy/inference/scg.py b/GPy/inference/scg.py index 5753be7f..f4c7c9c4 100644 --- a/GPy/inference/scg.py +++ b/GPy/inference/scg.py @@ -26,13 +26,16 @@ import numpy as np import sys -def print_out(len_maxiters, display, fnow, current_grad, beta, iteration): - if display: - print '\r', - print '{0:>0{mi}g} {1:> 12e} {2:> 12e} {3:> 12e}'.format(iteration, float(fnow), float(beta), 
float(current_grad), mi=len_maxiters), # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r', - sys.stdout.flush() +def print_out(len_maxiters, fnow, current_grad, beta, iteration): + print '\r', + print '{0:>0{mi}g} {1:> 12e} {2:> 12e} {3:> 12e}'.format(iteration, float(fnow), float(beta), float(current_grad), mi=len_maxiters), # print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta, '\r', + sys.stdout.flush() -def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xtol=None, ftol=None, gtol=None): +def exponents(fnow, current_grad): + exps = [np.abs(fnow), current_grad] + return np.sign(exps) * np.log10(exps).astype(int) + +def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True, xtol=None, ftol=None, gtol=None): """ Optimisation through Scaled Conjugate Gradients (SCG) @@ -52,11 +55,14 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto ftol = 1e-6 if gtol is None: gtol = 1e-5 + sigma0 = 1.0e-8 fold = f(x, *optargs) # Initial function value. function_eval = 1 fnow = fold gradnew = gradf(x, *optargs) # Initial gradient. + if any(np.isnan(gradnew)): + raise UnexpectedInfOrNan current_grad = np.dot(gradnew, gradnew) gradold = gradnew.copy() d = -gradnew # Initial search direction. @@ -64,7 +70,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto nsuccess = 0 # nsuccess counts number of successes. beta = 1.0 # Initial scale parameter. betamin = 1.0e-60 # Lower bound on scale. - betamax = 1.0e100 # Upper bound on scale. + betamax = 1.0e50 # Upper bound on scale. status = "Not converged" flog = [fold] @@ -74,6 +80,8 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto len_maxiters = len(str(maxiters)) if display: print ' {0:{mi}s} {1:11s} {2:11s} {3:11s}'.format("I", "F", "Scale", "|g|", mi=len_maxiters) + exps = exponents(fnow, current_grad) + p_iter = iteration # Main optimization loop. 
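The `exponents` helper added above drives the display throttling in the loop that follows: it collapses the objective value and the squared gradient norm to (truncated) integer orders of magnitude, so a fresh output line is only forced when either quantity changes scale. Its behaviour, with the function body reproduced verbatim from the diff:

```python
import numpy as np

def exponents(fnow, current_grad):
    exps = [np.abs(fnow), current_grad]
    return np.sign(exps) * np.log10(exps).astype(int)

print(exponents(-1234.5, 0.02))   # [ 3. -1.]: log10 magnitudes, truncated toward zero
```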
while iteration < maxiters: @@ -103,9 +111,9 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto fnew = f(xnew, *optargs) function_eval += 1 - if function_eval >= max_f_eval: - status = "Maximum number of function evaluations exceeded" - break +# if function_eval >= max_f_eval: +# status = "maximum number of function evaluations exceeded" +# break # return x, flog, function_eval, status Delta = 2.*(fnew - fold) / (alpha * mu) @@ -122,15 +130,28 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto flog.append(fnow) # Current function value iteration += 1 - print_out(len_maxiters, display, fnow, current_grad, beta, iteration) + if display: + print_out(len_maxiters, fnow, current_grad, beta, iteration) + n_exps = exponents(fnow, current_grad) + if iteration - p_iter >= 20 * np.random.rand(): + a = iteration >= p_iter * 2.78 + b = np.any(n_exps < exps) + if a or b: + p_iter = iteration + print '' + if b: + exps = n_exps if success: # Test for termination - if (np.max(np.abs(alpha * d)) < xtol) or (np.abs(fnew - fold) < ftol): - status = 'converged' + + if (np.abs(fnew - fold) < ftol): + status = 'converged - relative reduction in objective' break # return x, flog, function_eval, status - + elif (np.max(np.abs(alpha * d)) < xtol): + status = 'converged - relative stepsize' + break else: # Update variables for new position gradnew = gradf(x, *optargs) @@ -139,7 +160,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto fold = fnew # If the gradient is zero then we are done. if current_grad <= gtol: - status = 'converged' + status = 'converged - relative reduction in gradient' break # return x, flog, function_eval, status @@ -164,6 +185,7 @@ def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=500, display=True, xto status = "maxiter exceeded" if display: - print_out(len_maxiters, display, fnow, current_grad, beta, iteration) + print_out(len_maxiters, fnow, current_grad, beta, iteration) print "" + print status return x, flog, function_eval, status diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py index 6d9cf07a..eb4076c3 100644 --- a/GPy/kern/__init__.py +++ b/GPy/kern/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt). 
# Licensed under the BSD 3-clause license (see LICENSE.txt) from constructors import * @@ -6,4 +6,4 @@ try: from constructors import rbf_sympy, sympykern # these depend on sympy except: pass -from kern import kern +from kern import * diff --git a/GPy/kern/constructors.py b/GPy/kern/constructors.py index 697f3554..f7c0fd67 100644 --- a/GPy/kern/constructors.py +++ b/GPy/kern/constructors.py @@ -5,6 +5,23 @@ import numpy as np from kern import kern import parts + +def rbf_inv(input_dim,variance=1., inv_lengthscale=None,ARD=False): + """ + Construct an RBF kernel parameterized by its inverse lengthscale + + :param input_dim: dimensionality of the kernel, obligatory + :type input_dim: int + :param variance: the variance of the kernel + :type variance: float + :param inv_lengthscale: the inverse lengthscale of the kernel + :type inv_lengthscale: float + :param ARD: Auto Relevance Determination (one inverse lengthscale per dimension) + :type ARD: Boolean + """ + part = parts.rbf_inv.RBFInv(input_dim,variance,inv_lengthscale,ARD) + return kern(input_dim, [part]) + def rbf(input_dim,variance=1., lengthscale=None,ARD=False): """ Construct an RBF kernel @@ -34,6 +51,78 @@ def linear(input_dim,variances=None,ARD=False): part = parts.linear.Linear(input_dim,variances,ARD) return kern(input_dim, [part]) +def mlp(input_dim,variance=1., weight_variance=None,bias_variance=100.,ARD=False): + """ + Construct an MLP kernel + + :param input_dim: dimensionality of the kernel, obligatory + :type input_dim: int + :param variance: the variance of the kernel + :type variance: float + :param weight_variance: the variance of the input weights in the neural network + :type weight_variance: vector of weight variances for input weights in neural network (length 1 if kernel is isotropic) + :param bias_variance: the variance of the biases in the neural network. + :type bias_variance: float + :param ARD: Auto Relevance Determination (allows for ARD version of covariance) + :type ARD: Boolean + """ + part = parts.mlp.MLP(input_dim,variance,weight_variance,bias_variance,ARD) + return kern(input_dim, [part]) + +def gibbs(input_dim,variance=1., mapping=None): + """ + Gibbs and MacKay non-stationary covariance function. + + .. math:: + + r = \sqrt{(x_i - x_j)^\top (x_i - x_j)} + + k(x_i, x_j) = \sigma^2 Z \exp\left(-\frac{r^2}{l(x_i)^2 + l(x_j)^2}\right) + + Z = \sqrt{\frac{2 l(x_i) l(x_j)}{l(x_i)^2 + l(x_j)^2}} + + where :math:`l(x)` is a function giving the length scale as a function of space. + This is the non-stationary kernel proposed by Mark Gibbs in his 1997 + thesis. It is similar to an RBF but has a length scale that varies + with input location. This leads to an additional term in front of + the kernel. + + The parameters are :math:`\sigma^2`, the process variance, and the parameters of :math:`l(x)`, which is a function that can be specified by the user; by default a multi-layer perceptron is used. + + :param input_dim: the number of input dimensions + :type input_dim: int + :param variance: the variance :math:`\sigma^2` + :type variance: float + :param mapping: the mapping that gives the lengthscale across the input space. + :type mapping: GPy.core.Mapping
+ :rtype: Kernpart object + + """ + part = parts.gibbs.Gibbs(input_dim,variance,mapping) + return kern(input_dim, [part]) + +def poly(input_dim,variance=1., weight_variance=None,bias_variance=1.,degree=2, ARD=False): + """ + Construct a polynomial kernel + + :param input_dim: dimensionality of the kernel, obligatory + :type input_dim: int + :param variance: the variance of the kernel + :type variance: float + :param weight_variance: the variance of the input weights + :type weight_variance: vector of weight variances for input weights. + :param bias_variance: the variance of the biases. + :type bias_variance: float + :param degree: the degree of the polynomial + :type degree: int + :param ARD: Auto Relevance Determination (allows for ARD version of covariance) + :type ARD: Boolean + """ + part = parts.poly.POLY(input_dim,variance,weight_variance,bias_variance,degree,ARD) + return kern(input_dim, [part]) + def white(input_dim,variance=1.): """ Construct a white kernel. @@ -227,7 +316,7 @@ def periodic_Matern52(input_dim, variance=1., lengthscale=None, period=2 * np.pi :param n_freq: the number of frequencies considered for the periodic subspace :type n_freq: int """ - part = parts.periodic_Matern52part(input_dim, variance, lengthscale, period, n_freq, lower, upper) + part = parts.periodic_Matern52.PeriodicMatern52(input_dim, variance, lengthscale, period, n_freq, lower, upper) return kern(input_dim, [part]) def prod(k1,k2,tensor=False): @@ -236,21 +325,44 @@ def prod(k1,k2,tensor=False): :param k1, k2: the kernels to multiply :type k1, k2: kernpart + :param tensor: whether the kernels are multiplied as functions defined on the same input space (default) or on the tensor product of their input spaces + :type tensor: Boolean :rtype: kernel object """ - part = parts.prodpart(k1,k2,tensor) + part = parts.prod.Prod(k1, k2, tensor) return kern(part.input_dim, [part]) def symmetric(k): """ - Construct a symmetrical kernel from an existing kernel + Construct a symmetric kernel from an existing kernel """ k_ = k.copy() k_.parts = [symmetric.Symmetric(p) for p in k.parts] return k_ -def coregionalise(Nout,R=1, W=None, kappa=None): - p = parts.coregionalise.Coregionalise(Nout,R,W,kappa) +def coregionalise(output_dim, rank=1, W=None, kappa=None): + """ + Coregionalisation kernel. + + Used for computing covariance functions of the form + .. math:: + k_2(x, y) = \mathbf{B} \, k(x, y) + where + .. math:: + \mathbf{B} = \mathbf{W}\mathbf{W}^\top + \mathrm{diag}(\kappa) + + :param output_dim: the number of output dimensions + :type output_dim: int + :param rank: the rank of the coregionalisation matrix. + :type rank: int + :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalisation matrix B. + :type W: ndarray + :param kappa: a diagonal term which allows the outputs to behave independently. + :rtype: kernel object + + .. Note: see coregionalisation examples in GPy.examples.regression for some usage. + """ + p = parts.coregionalise.Coregionalise(output_dim,rank,W,kappa) + return kern(1,[p]) @@ -274,11 +386,13 @@ def fixed(input_dim, K, variance=1.): """ Construct a Fixed effect kernel.
- Arguments - --------- - input_dim (int), obligatory - K (np.array), obligatory - variance (float) + :param input_dim: the number of input dimensions + :type input_dim: int (input_dim=1 is the only value currently supported) + :param K: the fixed covariance matrix + :type K: np.array + :param variance: kernel variance + :type variance: float + :rtype: kern object """ part = parts.fixed.Fixed(input_dim, K, variance) return kern(input_dim, [part]) @@ -296,5 +410,14 @@ def independent_outputs(k): """ for sl in k.input_slices: assert (sl.start is None) and (sl.stop is None), "cannot adjust input slices! (TODO)" - parts = [independent_outputs.IndependentOutputs(p) for p in k.parts] - return kern(k.input_dim+1,parts) + _parts = [parts.independent_outputs.IndependentOutputs(p) for p in k.parts] + return kern(k.input_dim+1,_parts) + +def hierarchical(k): + """ + TODO: This can't be right! Construct a kernel with independent outputs from an existing kernel + """ + # for sl in k.input_slices: + # assert (sl.start is None) and (sl.stop is None), "cannot adjust input slices! (TODO)" + _parts = [parts.hierarchical.Hierarchical(k.parts)] + return kern(k.input_dim+len(k.parts),_parts) diff --git a/GPy/kern/kern.py b/GPy/kern/kern.py index ad3ed3f9..d9928295 100644 --- a/GPy/kern/kern.py +++ b/GPy/kern/kern.py @@ -3,17 +3,21 @@ import numpy as np import pylab as pb -from ..core.parameterised import Parameterised +from ..core.parameterized import Parameterized from parts.kernpart import Kernpart import itertools from parts.prod import Prod as prod +from matplotlib.transforms import offset_copy -class kern(Parameterised): +class kern(Parameterized): def __init__(self, input_dim, parts=[], input_slices=None): """ This is the main kernel class for GPy. It handles multiple (additive) kernel functions, and keeps track of various things like which parameters live where. - The technical code for kernels is divided into _parts_ (see e.g. rbf.py). This obnject contains a list of parts, which are computed additively. For multiplication, special _prod_ parts are used. + The technical code for kernels is divided into _parts_ (see + e.g. rbf.py). This object contains a list of parts, which are + computed additively. For multiplication, special _prod_ parts + are used.
:param input_dim: The dimensionality of the kernel's input space :type input_dim: int @@ -41,26 +45,100 @@ class kern(Parameterised): self.compute_param_slices() - Parameterised.__init__(self) + Parameterized.__init__(self) + + def getstate(self): + """ + Get the current state of the class, + here just all the indices; the rest can be recomputed + """ + return Parameterized.getstate(self) + [self.parts, + self.Nparts, + self.num_params, + self.input_dim, + self.input_slices, + self.param_slices + ] + + def setstate(self, state): + self.param_slices = state.pop() + self.input_slices = state.pop() + self.input_dim = state.pop() + self.num_params = state.pop() + self.Nparts = state.pop() + self.parts = state.pop() + Parameterized.setstate(self, state) - def plot_ARD(self, fignum=None, ax=None): - """If an ARD kernel is present, it bar-plots the ARD parameters""" + def plot_ARD(self, fignum=None, ax=None, title='', legend=False): + """If an ARD kernel is present, it bar-plots the ARD parameters. + + :param fignum: figure number of the plot + :param ax: matplotlib axis to plot on + :param title: + title of the plot, + pass '' to not print a title, + pass None for a generic title + """ if ax is None: fig = pb.figure(fignum) ax = fig.add_subplot(111) + else: + fig = ax.figure + from GPy.util import Tango + from matplotlib.textpath import TextPath + Tango.reset() + xticklabels = [] + bars = [] + x0 = 0 for p in self.parts: + c = Tango.nextMedium() if hasattr(p, 'ARD') and p.ARD: - ax.set_title('ARD parameters, %s kernel' % p.name) - + if title is None: + ax.set_title('ARD parameters, %s kernel' % p.name) + else: + ax.set_title(title) if p.name == 'linear': ard_params = p.variances else: ard_params = 1. / p.lengthscale - ax.bar(np.arange(len(ard_params)) - 0.4, ard_params) - ax.set_xticks(np.arange(len(ard_params))) - ax.set_xticklabels([r"${}$".format(i) for i in range(len(ard_params))]) + x = np.arange(x0, x0 + len(ard_params)) + bars.append(ax.bar(x, ard_params, align='center', color=c, edgecolor='k', linewidth=1.2, label=p.name)) + xticklabels.extend([r"$\mathrm{{{name}}}\ {x}$".format(name=p.name, x=i) for i in np.arange(len(ard_params))]) + x0 += len(ard_params) + x = np.arange(x0) + transOffset = offset_copy(ax.transData, fig=fig, + x=0., y= -2., units='points') + transOffsetUp = offset_copy(ax.transData, fig=fig, + x=0., y=1., units='points') + for bar in bars: + for patch, num in zip(bar.patches, np.arange(len(bar.patches))): + height = patch.get_height() + xi = patch.get_x() + patch.get_width() / 2. + va = 'top' + c = 'w' + t = TextPath((0, 0), "${xi}$".format(xi=xi), rotation=0, usetex=True, ha='center') + transform = transOffset + if patch.get_extents().height <= t.get_extents().height + 3: + va = 'bottom' + c = 'k' + transform = transOffsetUp + ax.text(xi, height, "${xi}$".format(xi=int(num)), color=c, rotation=0, ha='center', va=va, transform=transform) + # for xi, t in zip(x, xticklabels): + # ax.text(xi, maxi / 2, t, rotation=90, ha='center', va='center') + # ax.set_xticklabels(xticklabels, rotation=17) + ax.set_xticks([]) + ax.set_xlim(-.5, x0 - .5) + if legend: + if title == '': + mode = 'expand' + if len(bars) > 1: + mode = 'expand' + ax.legend(bbox_to_anchor=(0., 1.02, 1., 1.02), loc=3, + ncol=len(bars), mode=mode, borderaxespad=0.)
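The `getstate`/`setstate` pair added to `kern` above follows the same convention as the SVIGP and GP versions earlier in this patch: state is a flat list, each subclass appends its own fields after its parent's, and `setstate` pops them back in reverse before delegating upwards. A toy round-trip showing the LIFO symmetry (illustrative classes, not GPy's):

```python
class Base(object):
    def __init__(self):
        self.a = 1
    def getstate(self):
        return [self.a]
    def setstate(self, state):
        self.a = state.pop()

class Child(Base):
    def __init__(self):
        Base.__init__(self)
        self.b = 2
    def getstate(self):
        return Base.getstate(self) + [self.b]   # parent's state first, own fields last
    def setstate(self, state):
        self.b = state.pop()                    # pop own fields first (reverse order)
        Base.setstate(self, state)              # then hand the remainder to the parent

c = Child()
state = c.getstate()   # [1, 2]
c.a = c.b = 0
c.setstate(state)      # a == 1, b == 2 again: pops undo appends in reverse
```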
+ fig.tight_layout(rect=(0, 0, 1, .9)) + else: + ax.legend() return ax def _transform_gradients(self, g): @@ -74,7 +152,7 @@ class kern(Parameterised): return g def compute_param_slices(self): - """create a set of slices that can index the parameters of each part""" + """create a set of slices that can index the parameters of each part.""" self.param_slices = [] count = 0 for p in self.parts: @@ -125,11 +203,19 @@ class kern(Parameterised): """ return self.prod(other) + def __pow__(self, other, tensor=False): + """ + Shortcut for tensor `prod`. + """ + return self.prod(other, tensor=True) + def prod(self, other, tensor=False): """ - multiply two kernels (either on the same space, or on the tensor product of the input space) + multiply two kernels (either on the same space, or on the tensor product of the input space). :param other: the other kernel to be multiplied :type other: GPy.kern + :param tensor: whether or not to use the tensor space (default is false). + :type tensor: bool """ K1 = self.copy() K2 = other.copy() @@ -198,7 +284,7 @@ class kern(Parameterised): [p._set_params(x[s]) for p, s in zip(self.parts, self.param_slices)] def _get_param_names(self): - # this is a bit nasty: we wat to distinguish between parts with the same name by appending a count + # this is a bit nasty: we want to distinguish between parts with the same name by appending a count part_names = np.array([k.name for k in self.parts], dtype=np.str) counts = [np.sum(part_names == ni) for i, ni in enumerate(part_names)] cum_counts = [np.sum(part_names[i:] == ni) for i, ni in enumerate(part_names)] @@ -220,11 +306,13 @@ class kern(Parameterised): def dK_dtheta(self, dL_dK, X, X2=None): """ - :param dL_dK: An array of dL_dK derivaties, dL_dK - :type dL_dK: Np.ndarray (N x num_inducing) + Compute the gradient of the covariance function with respect to the parameters. + + :param dL_dK: An array of gradients of the objective function with respect to the covariance function. + :type dL_dK: np.ndarray (num_samples x num_inducing) :param X: Observed data inputs - :type X: np.ndarray (N x input_dim) - :param X2: Observed dara inputs (optional, defaults to X) + :type X: np.ndarray (num_samples x input_dim) + :param X2: Observed data inputs (optional, defaults to X) :type X2: np.ndarray (num_inducing x input_dim) """ assert X.shape[1] == self.input_dim @@ -237,6 +325,14 @@ class kern(Parameterised): return self._transform_gradients(target) def dK_dX(self, dL_dK, X, X2=None): + """Compute the gradient of the covariance function with respect to X. + + :param dL_dK: An array of gradients of the objective function with respect to the covariance function.
@@ -237,6 +325,14 @@ class kern(Parameterised):
         return self._transform_gradients(target)

     def dK_dX(self, dL_dK, X, X2=None):
+        """Compute the gradient of the covariance function with respect to X.
+
+        :param dL_dK: An array of gradients of the objective function with respect to the covariance function.
+        :type dL_dK: np.ndarray (num_samples x num_inducing)
+        :param X: Observed data inputs
+        :type X: np.ndarray (num_samples x input_dim)
+        :param X2: Observed data inputs (optional, defaults to X)
+        :type X2: np.ndarray (num_inducing x input_dim)
+        """
         if X2 is None:
             X2 = X
         target = np.zeros_like(X)
@@ -247,6 +343,7 @@ class kern(Parameterised):
         return target

     def Kdiag(self, X, which_parts='all'):
+        """Compute the diagonal of the covariance function for inputs X."""
         if which_parts == 'all':
             which_parts = [True] * self.Nparts
         assert X.shape[1] == self.input_dim
@@ -255,6 +352,7 @@ class kern(Parameterised):
         return target

     def dKdiag_dtheta(self, dL_dKdiag, X):
+        """Compute the gradient of the diagonal of the covariance function with respect to the parameters."""
         assert X.shape[1] == self.input_dim
         assert dL_dKdiag.size == X.shape[0]
         target = np.zeros(self.num_params)
@@ -298,16 +396,18 @@ class kern(Parameterised):
         return target

     def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S):
-        """return shapes are N,num_inducing,input_dim"""
+        """return shapes are num_samples,num_inducing,input_dim"""
         target_mu, target_S = np.zeros((2, mu.shape[0], mu.shape[1]))
         [p.dpsi1_dmuS(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s]) for p, i_s in zip(self.parts, self.input_slices)]
         return target_mu, target_S

     def psi2(self, Z, mu, S):
         """
+        Compute the psi2 statistics for the covariance function.
+
         :param Z: np.ndarray of inducing inputs (num_inducing x input_dim)
-        :param mu, S: np.ndarrays of means and variances (each N x input_dim)
-        :returns psi2: np.ndarray (N,num_inducing,num_inducing)
+        :param mu, S: np.ndarrays of means and variances (each num_samples x input_dim)
+        :returns psi2: np.ndarray (num_samples,num_inducing,num_inducing)
         """
         target = np.zeros((mu.shape[0], Z.shape[0], Z.shape[0]))
         [p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)]
@@ -316,21 +416,22 @@ class kern(Parameterised):

         # TODO: input_slices needed
         crossterms = 0
-        for p1, p2 in itertools.combinations(self.parts, 2):
-            # TODO psi1 this must be faster/better/precached/more nice
-            tmp1 = np.zeros((mu.shape[0], Z.shape[0]))
-            p1.psi1(Z, mu, S, tmp1)
-            tmp2 = np.zeros((mu.shape[0], Z.shape[0]))
-            p2.psi1(Z, mu, S, tmp2)
-
-            prod = np.multiply(tmp1, tmp2)
-            crossterms += prod[:, :, None] + prod[:, None, :]
-
-        target += crossterms
-        return target
+        for [p1, i_s1], [p2, i_s2] in itertools.combinations(zip(self.parts, self.input_slices), 2):
+            if i_s1 == i_s2:
+                # TODO psi1 this must be faster/better/precached/more nice
+                tmp1 = np.zeros((mu.shape[0], Z.shape[0]))
+                p1.psi1(Z[:, i_s1], mu[:, i_s1], S[:, i_s1], tmp1)
+                tmp2 = np.zeros((mu.shape[0], Z.shape[0]))
+                p2.psi1(Z[:, i_s2], mu[:, i_s2], S[:, i_s2], tmp2)
+
+                prod = np.multiply(tmp1, tmp2)
+                crossterms += prod[:, :, None] + prod[:, None, :]
+
+        return target + crossterms

     def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S):
+        """Gradient of the psi2 statistics with respect to the parameters."""
         target = np.zeros(self.num_params)
         [p.dpsi2_dtheta(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, i_s, ps in zip(self.parts, self.input_slices, self.param_slices)]
@@ -434,3 +535,135 @@ class kern(Parameterised):
             pb.title("k(x1,x2 ; %0.1f,%0.1f)" % (x[0, 0], x[0, 1]))
         else:
             raise NotImplementedError, "Cannot plot a kernel with more than two input dimensions"
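Editor's note: a quick shape check for the psi statistics documented above. The sizes are arbitrary; it assumes an rbf part, which implements the psi statistics:

    import numpy as np
    import GPy
    num_samples, num_inducing, input_dim = 5, 3, 2
    k = GPy.kern.rbf(input_dim)
    Z = np.random.randn(num_inducing, input_dim)   # inducing inputs
    mu = np.random.randn(num_samples, input_dim)   # variational means
    S = np.random.rand(num_samples, input_dim)     # variational variances, so positive
    assert k.psi2(Z, mu, S).shape == (num_samples, num_inducing, num_inducing)
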
+
+from GPy.core.model import Model
+
+class Kern_check_model(Model):
+    """This is a dummy model class used as a base class for checking that the
+    gradients of a given kernel are implemented correctly. It enables
+    checkgrad() to be called independently on a kernel."""
+    def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
+        num_samples = 20
+        if kernel is None:
+            import GPy
+            kernel = GPy.kern.rbf(1)
+        if X is None:
+            X = np.random.randn(num_samples, kernel.input_dim)
+        if dL_dK is None:
+            if X2 is None:
+                dL_dK = np.ones((X.shape[0], X.shape[0]))
+            else:
+                dL_dK = np.ones((X.shape[0], X2.shape[0]))
+
+        self.kernel = kernel
+        self.X = X
+        self.X2 = X2
+        self.dL_dK = dL_dK
+        Model.__init__(self)
+
+    def is_positive_definite(self):
+        v = np.linalg.eig(self.kernel.K(self.X))[0]
+        if np.any(v < 0):
+            return False
+        else:
+            return True
+
+    def _get_params(self):
+        return self.kernel._get_params()
+
+    def _get_param_names(self):
+        return self.kernel._get_param_names()
+
+    def _set_params(self, x):
+        self.kernel._set_params(x)
+
+    def log_likelihood(self):
+        return (self.dL_dK * self.kernel.K(self.X, self.X2)).sum()
+
+    def _log_likelihood_gradients(self):
+        raise NotImplementedError, "This needs to be implemented to use the Kern_check_model class."
+
+class Kern_check_dK_dtheta(Kern_check_model):
+    """This class allows gradient checks for the gradient of a kernel with respect to its parameters."""
+    def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
+        Kern_check_model.__init__(self, kernel=kernel, dL_dK=dL_dK, X=X, X2=X2)
+
+    def _log_likelihood_gradients(self):
+        return self.kernel.dK_dtheta(self.dL_dK, self.X, self.X2)
+
+class Kern_check_dKdiag_dtheta(Kern_check_model):
+    """This class allows gradient checks of the gradient of the diagonal of a kernel with respect to the parameters."""
+    def __init__(self, kernel=None, dL_dK=None, X=None):
+        Kern_check_model.__init__(self, kernel=kernel, dL_dK=dL_dK, X=X, X2=None)
+        if dL_dK is None:
+            self.dL_dK = np.ones(self.X.shape[0])
+
+    def log_likelihood(self):
+        return (self.dL_dK * self.kernel.Kdiag(self.X)).sum()
+
+    def _log_likelihood_gradients(self):
+        return self.kernel.dKdiag_dtheta(self.dL_dK, self.X)
+
+class Kern_check_dK_dX(Kern_check_model):
+    """This class allows gradient checks for the gradient of a kernel with respect to X."""
+    def __init__(self, kernel=None, dL_dK=None, X=None, X2=None):
+        Kern_check_model.__init__(self, kernel=kernel, dL_dK=dL_dK, X=X, X2=X2)
+
+    def _log_likelihood_gradients(self):
+        return self.kernel.dK_dX(self.dL_dK, self.X, self.X2).flatten()
+
+    def _get_param_names(self):
+        return ['X_' + str(i) + ',' + str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])]
+
+    def _get_params(self):
+        return self.X.flatten()
+
+    def _set_params(self, x):
+        self.X = x.reshape(self.X.shape)
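Editor's note: the checker classes are driven through Model.checkgrad(). A hedged sketch of checking one kernel's parameter gradients by hand; the kern_test function below wraps exactly this pattern:

    import numpy as np
    import GPy
    k = GPy.kern.rbf(2, ARD=True)
    X = np.random.randn(10, 2)
    m = Kern_check_dK_dtheta(kernel=k, X=X)
    print m.checkgrad(verbose=True)    # compares analytic and finite-difference gradients
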
""" + def __init__(self, kernel=None, dL_dK=None, X=None, X2=None): + Kern_check_model.__init__(self,kernel=kernel,dL_dK=dL_dK, X=X, X2=None) + if dL_dK==None: + self.dL_dK = np.ones((self.X.shape[0])) + + def log_likelihood(self): + return (self.dL_dK*self.kernel.Kdiag(self.X)).sum() + + def _log_likelihood_gradients(self): + return self.kernel.dKdiag_dX(self.dL_dK, self.X).flatten() + + def _get_param_names(self): + return ['X_' +str(i) + ','+str(j) for j in range(self.X.shape[1]) for i in range(self.X.shape[0])] + + def _get_params(self): + return self.X.flatten() + + def _set_params(self, x): + self.X=x.reshape(self.X.shape) + +def kern_test(kern, X=None, X2=None, verbose=False): + """This function runs on kernels to check the correctness of their implementation. It checks that the covariance function is positive definite for a randomly generated data set. + + :param kern: the kernel to be tested. + :type kern: GPy.kern.Kernpart + :param X: X input values to test the covariance function. + :type X: ndarray + :param X2: X2 input values to test the covariance function. + :type X2: ndarray + """ + if X==None: + X = np.random.randn(10, kern.input_dim) + if X2==None: + X2 = np.random.randn(20, kern.input_dim) + result = [Kern_check_model(kern, X=X).is_positive_definite(), + Kern_check_dK_dtheta(kern, X=X, X2=X2).checkgrad(verbose=verbose), + Kern_check_dK_dtheta(kern, X=X, X2=None).checkgrad(verbose=verbose), + Kern_check_dKdiag_dtheta(kern, X=X).checkgrad(verbose=verbose), + Kern_check_dK_dX(kern, X=X, X2=X2).checkgrad(verbose=verbose), + Kern_check_dKdiag_dX(kern, X=X).checkgrad(verbose=verbose)] + # Need to check + #Kern_check_dK_dX(kern, X, X2=None).checkgrad(verbose=verbose)] + # but currently I think these aren't implemented. + return np.all(result) diff --git a/GPy/kern/parts/__init__.py b/GPy/kern/parts/__init__.py index 68762afc..053e280f 100644 --- a/GPy/kern/parts/__init__.py +++ b/GPy/kern/parts/__init__.py @@ -4,13 +4,16 @@ import coregionalise import exponential import finite_dimensional import fixed +import gibbs import independent_outputs import linear import Matern32 import Matern52 +import mlp import periodic_exponential import periodic_Matern32 import periodic_Matern52 +import poly import prod_orthogonal import prod import rational_quadratic @@ -19,3 +22,5 @@ import rbf import spline import symmetric import white +import hierarchical +import rbf_inv diff --git a/GPy/kern/parts/coregionalise.py b/GPy/kern/parts/coregionalise.py index 8faceafe..94179088 100644 --- a/GPy/kern/parts/coregionalise.py +++ b/GPy/kern/parts/coregionalise.py @@ -9,24 +9,42 @@ from scipy import weave class Coregionalise(Kernpart): """ - Kernel for Intrinsic Corregionalization Models + Coregionalisation kernel. + + Used for computing covariance functions of the form + .. math:: + k_2(x, y)=B k(x, y) + where + .. math:: + B = WW^\top + diag(kappa) + + :param output_dim: the number of output dimensions + :type output_dim: int + :param rank: the rank of the coregionalisation matrix. + :type rank: int + :param W: a low rank matrix that determines the correlations between the different outputs, together with kappa it forms the coregionalisation matrix B. + :type W: ndarray + :param kappa: a diagonal term which allows the outputs to behave independently. + :rtype: kernel object + + .. Note: see coregionalisation examples in GPy.examples.regression for some usage. 
""" - def __init__(self,Nout,R=1, W=None, kappa=None): + def __init__(self,output_dim,rank=1, W=None, kappa=None): self.input_dim = 1 self.name = 'coregion' - self.Nout = Nout - self.R = R + self.output_dim = output_dim + self.rank = rank if W is None: - self.W = np.ones((self.Nout,self.R)) + self.W = 0.5*np.random.randn(self.output_dim,self.rank)/np.sqrt(self.rank) else: - assert W.shape==(self.Nout,self.R) + assert W.shape==(self.output_dim,self.rank) self.W = W if kappa is None: - kappa = np.ones(self.Nout) + kappa = 0.5*np.ones(self.output_dim) else: - assert kappa.shape==(self.Nout,) + assert kappa.shape==(self.output_dim,) self.kappa = kappa - self.num_params = self.Nout*(self.R + 1) + self.num_params = self.output_dim*(self.rank + 1) self._set_params(np.hstack([self.W.flatten(),self.kappa])) def _get_params(self): @@ -34,12 +52,12 @@ class Coregionalise(Kernpart): def _set_params(self,x): assert x.size == self.num_params - self.kappa = x[-self.Nout:] - self.W = x[:-self.Nout].reshape(self.Nout,self.R) + self.kappa = x[-self.output_dim:] + self.W = x[:-self.output_dim].reshape(self.output_dim,self.rank) self.B = np.dot(self.W,self.W.T) + np.diag(self.kappa) def _get_param_names(self): - return sum([['W%i_%i'%(i,j) for j in range(self.R)] for i in range(self.Nout)],[]) + ['kappa_%i'%i for i in range(self.Nout)] + return sum([['W%i_%i'%(i,j) for j in range(self.rank)] for i in range(self.output_dim)],[]) + ['kappa_%i'%i for i in range(self.output_dim)] def K(self,index,index2,target): index = np.asarray(index,dtype=np.int) @@ -57,26 +75,26 @@ class Coregionalise(Kernpart): if index2 is None: code=""" for(int i=0;i