diff --git a/GPy/core/__init__.py b/GPy/core/__init__.py index 25651827..fb40a9e0 100644 --- a/GPy/core/__init__.py +++ b/GPy/core/__init__.py @@ -8,5 +8,4 @@ from parameterization.observable_array import ObsAr from gp import GP from sparse_gp import SparseGP -from svigp import SVIGP from mapping import * diff --git a/GPy/core/gp.py b/GPy/core/gp.py index 0b59c0ea..d3ebd17c 100644 --- a/GPy/core/gp.py +++ b/GPy/core/gp.py @@ -3,16 +3,12 @@ import numpy as np import sys -import warnings from .. import kern -from ..util.linalg import dtrtrs from model import Model from parameterization import ObsAr from .. import likelihoods -from ..likelihoods.gaussian import Gaussian -from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation, LatentFunctionInference +from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation from parameterization.variational import VariationalPosterior -from scipy.sparse.base import issparse import logging from GPy.util.normalizer import MeanNorm @@ -26,7 +22,7 @@ class GP(Model): :param Y: output observations :param kernel: a GPy kernel, defaults to rbf+white :param likelihood: a GPy likelihood - :param :class:`~GPy.inference.latent_function_inference.LatentFunctionInference` inference_method: The inference method to use for this GP + :param inference_method: The :class:`~GPy.inference.latent_function_inference.LatentFunctionInference` inference method to use for this GP :rtype: model object :param Norm normalizer: normalize the outputs Y. @@ -95,10 +91,13 @@ class GP(Model): def set_XY(self, X=None, Y=None): """ - Set the input / output of the model + Set the input / output data of the model. + This is useful if we wish to change our existing data but maintain the same model. :param X: input observations + :type X: np.ndarray :param Y: output observations + :type Y: np.ndarray """ self.update_model(False) if Y is not None: @@ -112,7 +111,6 @@ class GP(Model): if X is not None: if self.X in self.parameters: # LVM models - from ..core.parameterization.variational import VariationalPosterior if isinstance(self.X, VariationalPosterior): assert isinstance(X, type(self.X)), "The given X must have the same type as the X in the model!" self.unlink_parameter(self.X) @@ -129,22 +127,39 @@ class GP(Model): def set_X(self,X): """ - Set the input of the model + Set the input data of the model + + :param X: input observations + :type X: np.ndarray """ self.set_XY(X=X) def set_Y(self,Y): """ - Set the input of the model + Set the output data of the model + + :param Y: output observations + :type Y: np.ndarray """ self.set_XY(Y=Y) def parameters_changed(self): + """ + Method that is called upon any changes to :class:`~GPy.core.parameterization.param.Param` variables within the model. + In particular, in the GP class this method re-performs inference, recalculating the posterior, the log marginal likelihood and the gradients of the model. + + .. warning:: + This method is not designed to be called manually; the framework is set up to call it automatically whenever parameters change. If you call + this method yourself, there may be unexpected consequences.
+ """ self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.Y_metadata) self.likelihood.update_gradients(self.grad_dict['dL_dthetaL']) self.kern.update_gradients_full(self.grad_dict['dL_dK'], self.X) def log_likelihood(self): + """ + The log marginal likelihood of the model, :math:`p(\mathbf{y})`, this is the objective function of the model being optimised + """ return self._log_marginal_likelihood def _raw_predict(self, _Xnew, full_cov=False, kern=None): @@ -155,12 +170,10 @@ class GP(Model): of the prediction is computed. If full_cov is False (default), only the diagonal of the covariance is returned. - $$ - p(f*|X*, X, Y) = \int^{\inf}_{\inf} p(f*|f,X*)p(f|X,Y) df - = N(f*| K_{x*x}(K_{xx} + \Sigma)^{-1}Y, K_{x*x*} - K_{xx*}(K_{xx} + \Sigma)^{-1}K_{xx*} - \Sigma := \texttt{Likelihood.variance / Approximate likelihood covariance} - $$ - + .. math:: + p(f*|X*, X, Y) = \int^{\inf}_{\inf} p(f*|f,X*)p(f|X,Y) df + = N(f*| K_{x*x}(K_{xx} + \Sigma)^{-1}Y, K_{x*x*} - K_{xx*}(K_{xx} + \Sigma)^{-1}K_{xx*} + \Sigma := \texttt{Likelihood.variance / Approximate likelihood covariance} """ if kern is None: kern = self.kern @@ -185,23 +198,20 @@ class GP(Model): Predict the function(s) at the new point(s) Xnew. :param Xnew: The points at which to make a prediction - :type Xnew: np.ndarray, Nnew x self.input_dim + :type Xnew: np.ndarray (Nnew x self.input_dim) :param full_cov: whether to return the full covariance matrix, or just the diagonal :type full_cov: bool :param Y_metadata: metadata about the predicting point to pass to the likelihood :param kern: The kernel to use for prediction (defaults to the model kern). this is useful for examining e.g. subprocesses. - :returns: mean: posterior mean, a Numpy array, Nnew x self.input_dim - :returns: var: posterior variance, a Numpy array, Nnew x 1 if - full_cov=False, Nnew x Nnew otherwise - :returns: lower and upper boundaries of the 95% confidence intervals, - Numpy arrays, Nnew x self.input_dim - + :returns: (mean, var, lower_upper): + mean: posterior mean, a Numpy array, Nnew x self.input_dim + var: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise + lower_upper: lower and upper boundaries of the 95% confidence intervals, Numpy arrays, Nnew x self.input_dim If full_cov and self.input_dim > 1, the return shape of var is Nnew x Nnew x self.input_dim. If self.input_dim == 1, the return shape is Nnew x Nnew. This is to allow for different normalizations of the output dimensions. 
- """ #predict the latent function values mu, var = self._raw_predict(Xnew, full_cov=full_cov, kern=kern) @@ -213,6 +223,16 @@ class GP(Model): return mean, var def predict_quantiles(self, X, quantiles=(2.5, 97.5), Y_metadata=None): + """ + Get the predictive quantiles around the prediction at X + + :param X: The points at which to make a prediction + :type X: np.ndarray (Xnew x self.input_dim) + :param quantiles: tuple of quantiles, default is (2.5, 97.5) which is the 95% interval + :type quantiles: tuple + :returns: list of quantiles for each X and predictive quantiles for interval combination + :rtype: [np.ndarray (Xnew x self.input_dim), np.ndarray (Xnew x self.input_dim)] + """ m, v = self._raw_predict(X, full_cov=False) if self.normalizer is not None: m, v = self.normalizer.inverse_mean(m), self.normalizer.inverse_variance(v) @@ -225,7 +245,12 @@ class GP(Model): Given a set of points at which to predict X* (size [N*,Q]), compute the derivatives of the mean and variance. Resulting arrays are sized: dmu_dX* -- [N*, Q ,D], where D is the number of output in this GP (usually one). + dv_dX* -- [N*, Q], (since all outputs have the same variance) + :param X: The points at which to get the predictive gradients + :type X: np.ndarray (Xnew x self.input_dim) + :returns: dmu_dX, dv_dX + :rtype: [np.ndarray (N*, Q ,D), np.ndarray (N*,Q) ] """ dmu_dX = np.empty((Xnew.shape[0],Xnew.shape[1],self.output_dim)) @@ -245,12 +270,13 @@ class GP(Model): Samples the posterior GP at the points X. :param X: The points at which to take the samples. - :type X: np.ndarray, Nnew x self.input_dim. + :type X: np.ndarray (Nnew x self.input_dim) :param size: the number of a posteriori samples. :type size: int. :param full_cov: whether to return the full covariance matrix, or just the diagonal. :type full_cov: bool. - :returns: Ysim: set of simulations, a Numpy array (N x samples). + :returns: Ysim: set of simulations + :rtype: np.ndarray (N x samples) """ m, v = self._raw_predict(X, full_cov=full_cov) if self.normalizer is not None: @@ -268,7 +294,7 @@ class GP(Model): Samples the posterior GP at the points X. :param X: the points at which to take the samples. - :type X: np.ndarray, Nnew x self.input_dim. + :type X: np.ndarray (Nnew x self.input_dim.) :param size: the number of a posteriori samples. :type size: int. :param full_cov: whether to return the full covariance matrix, or just the diagonal. @@ -292,6 +318,37 @@ class GP(Model): This is a call to plot with plot_raw=True. Data will not be plotted in this, as the GP's view of the world may live in another space, or units then the data. + + Can plot only part of the data and part of the posterior functions + using which_data_rowsm which_data_ycols. + + :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits + :type plot_limits: np.array + :param which_data_rows: which of the training data to plot (default all) + :type which_data_rows: 'all' or a slice object to slice model.X, model.Y + :param which_data_ycols: when the data has several columns (independant outputs), only plot these + :type which_data_ycols: 'all' or a list of integers + :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v. + :type fixed_inputs: a list of tuples + :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D + :type resolution: int + :param levels: number of levels to plot in a contour plot. 
+ :type levels: int + :param samples: the number of a posteriori samples to plot + :type samples: int + :param fignum: figure to plot on. + :type fignum: figure number + :param ax: axes to plot on. + :type ax: axes handle + :param linecol: color of line to plot [Tango.colorsHex['darkBlue']] + :type linecol: color either as a Tango.colorsHex object or a character ('r' is red, 'g' is green), as is standard in matplotlib + :param fillcol: color of fill [Tango.colorsHex['lightBlue']] + :type fillcol: color either as a Tango.colorsHex object or a character ('r' is red, 'g' is green), as is standard in matplotlib + :param Y_metadata: additional data associated with Y which may be needed + :type Y_metadata: dict + :param data_symbol: symbol as used in matplotlib, by default this is a black cross ('kx') + :type data_symbol: a colour and marker string as is standard in matplotlib, e.g. 'kx' """ assert "matplotlib" in sys.modules, "matplotlib package has not been imported." from ..plotting.matplot_dep import models_plots @@ -325,12 +382,13 @@ class GP(Model): :param which_data_rows: which of the training data to plot (default all) :type which_data_rows: 'all' or a slice object to slice model.X, model.Y :param which_data_ycols: when the data has several columns (independant outputs), only plot these - :type which_data_rows: 'all' or a list of integers + :type which_data_ycols: 'all' or a list of integers :param fixed_inputs: a list of tuple [(i,v), (i,v)...], specifying that input index i should be set to value v. :type fixed_inputs: a list of tuples :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D :type resolution: int :param levels: number of levels to plot in a contour plot. :type levels: int :param samples: the number of a posteriori samples to plot :type samples: int @@ -338,11 +396,14 @@ class GP(Model): :type fignum: figure number :param ax: axes to plot on. :type ax: axes handle - :type output: integer (first output is 0) :param linecol: color of line to plot [Tango.colorsHex['darkBlue']] - :type linecol: + :type linecol: color either as a Tango.colorsHex object or a character ('r' is red, 'g' is green), as is standard in matplotlib :param fillcol: color of fill [Tango.colorsHex['lightBlue']] - :param levels: for 2D plotting, the number of contour levels to use is ax is None, create a new figure + :type fillcol: color either as a Tango.colorsHex object or a character ('r' is red, 'g' is green), as is standard in matplotlib + :param Y_metadata: additional data associated with Y which may be needed + :type Y_metadata: dict + :param data_symbol: symbol as used in matplotlib, by default this is a black cross ('kx') + :type data_symbol: a colour and marker string as is standard in matplotlib, e.g. 'kx' """ assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
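# A minimal plotting sketch, assuming the keyword arguments documented above and a toy
# two-input regression model; fixed_inputs pins input column 1 at 0.0 so the posterior is
# drawn over column 0 alone, and resolution/samples follow the parameter descriptions.
import numpy as np
import GPy

X = np.random.uniform(-3., 3., (40, 2))
Y = np.sin(X[:, :1]) * np.cos(X[:, 1:2]) + np.random.randn(40, 1) * 0.05
m = GPy.models.GPRegression(X, Y)
m.optimize()

m.plot(fixed_inputs=[(1, 0.0)], resolution=300)   # posterior of the data-space prediction
m.plot_f(resolution=300, samples=3)               # latent function with a few posterior samples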
from ..plotting.matplot_dep import models_plots @@ -372,10 +433,8 @@ class GP(Model): :type max_f_eval: int :messages: whether to display during optimisation :type messages: bool - :param optimizer: which optimizer to use (defaults to self.preferred optimizer) + :param optimizer: which optimizer to use (defaults to self.preferred optimizer). A range of optimisers can be found in :mod:`~GPy.inference.optimization`; they include 'scg', 'lbfgs' and 'tnc'. :type optimizer: string - - TODO: valid args """ self.inference_method.on_optimization_start() try: @@ -394,7 +453,11 @@ class GP(Model): :param optimize: whether to optimize the location of new X (True by default) :type optimize: boolean :return: a tuple containing the posterior estimation of X and the model that optimize X - :rtype: (GPy.core.parameterization.variational.VariationalPosterior or numpy.ndarray, GPy.core.Model) + :rtype: (:class:`~GPy.core.parameterization.variational.VariationalPosterior` or numpy.ndarray, :class:`~GPy.core.model.Model`) """ from ..inference.latent_function_inference.inferenceX import infer_newX return infer_newX(self, Y_new, optimize=optimize) diff --git a/GPy/core/model.py b/GPy/core/model.py index 355087ca..029b8fc8 100644 --- a/GPy/core/model.py +++ b/GPy/core/model.py @@ -234,9 +234,9 @@ class Model(Parameterized): """ if self.is_fixed: - raise RuntimeError, "Cannot optimize, when everything is fixed" + print 'nothing to optimize: all parameters are fixed' if self.size == 0: - raise RuntimeError, "Model without parameters cannot be optimized" + print 'nothing to optimize: the model has no parameters' if start == None: start = self.optimizer_array diff --git a/GPy/core/sparse_gp_mpi.py b/GPy/core/sparse_gp_mpi.py index e9bd770d..e8779f51 100644 --- a/GPy/core/sparse_gp_mpi.py +++ b/GPy/core/sparse_gp_mpi.py @@ -47,7 +47,7 @@ class SparseGP_MPI(SparseGP): if variational_prior is not None: self.link_parameter(variational_prior) - + self.mpi_comm = mpi_comm # Manage the data (Y) division if mpi_comm != None: @@ -60,7 +60,6 @@ class SparseGP_MPI(SparseGP): mpi_comm.Bcast(self.param_array, root=0) self.update_model(True) - def __getstate__(self): dc = super(SparseGP_MPI, self).__getstate__() dc['mpi_comm'] = None diff --git a/GPy/core/svigp.py b/GPy/core/svigp.py deleted file mode 100644 index f78618eb..00000000 --- a/GPy/core/svigp.py +++ /dev/null @@ -1,434 +0,0 @@ -# Copyright (c) 2012, James Hensman and Nicolo' Fusi -# Licensed under the BSD 3-clause license (see LICENSE.txt) - -import numpy as np -from ..util.linalg import pdinv, mdot, tdot, dpotrs, dtrtrs, jitchol, backsub_both_sides -from gp import GP -import time -import sys - - -class SVIGP(GP): - """ - - Stochastic Variational inference in a Gaussian Process - - :param X: inputs - :type X: np.ndarray (N x Q) - :param Y: observed data - :type Y: np.ndarray of observations (N x D) - :param batchsize: the size of a h - - Additional kwargs are used as for a sparse GP. They include: - - :param q_u: canonical parameters of the distribution sqehd into a 1D array - :type q_u: np.ndarray - :param M: Number of inducing points (optional, default 10. Ignored if Z is not None) - :type M: int - :param kernel: the kernel/covariance function. See link kernels
See link kernels - :type kernel: a GPy kernel - :param Z: inducing inputs (optional, see note) - :type Z: np.ndarray (M x Q) | None - :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance) - :type X_uncertainty: np.ndarray (N x Q) | None - :param Zslices: slices for the inducing inputs (see slicing TODO: link) - :param M: Number of inducing points (optional, default 10. Ignored if Z is not None) - :type M: int - :param beta: noise precision. TODO: ignore beta if doing EP - :type beta: float - :param normalize_(X|Y): whether to normalize the data before computing (predictions will be in original scales) - :type normalize_(X|Y): bool - - """ - - - def __init__(self, X, Y, kernel, Z, q_u=None, batchsize=10): - raise NotImplementedError, "This is a work in progress, see github issue " - GP.__init__(self, X, Y, kernel) - self.batchsize=batchsize - self.Z = Z - self.num_inducing = Z.shape[0] - - self.batchcounter = 0 - self.epochs = 0 - self.iterations = 0 - - self.vb_steplength = 0.05 - self.param_steplength = 1e-5 - self.momentum = 0.9 - - if q_u is None: - q_u = np.hstack((np.random.randn(self.num_inducing*self.output_dim),-.5*np.eye(self.num_inducing).flatten())) - - self.set_vb_param(q_u) - - self._permutation = np.random.permutation(self.num_data) - self.load_batch() - - self._param_trace = [] - self._ll_trace = [] - self._grad_trace = [] - - #set the adaptive steplength parameters - #self.hbar_t = 0.0 - #self.tau_t = 100.0 - #self.gbar_t = 0.0 - #self.gbar_t1 = 0.0 - #self.gbar_t2 = 0.0 - #self.hbar_tp = 0.0 - #self.tau_tp = 10000.0 - #self.gbar_tp = 0.0 - #self.adapt_param_steplength = True - #self.adapt_vb_steplength = True - #self._param_steplength_trace = [] - #self._vb_steplength_trace = [] - - def _compute_kernel_matrices(self): - # kernel computations, using BGPLVM notation - self.Kmm = self.kern.K(self.Z) - self.psi0 = self.kern.Kdiag(self.X_batch) - self.psi1 = self.kern.K(self.X_batch, self.Z) - self.psi2 = None - - def dL_dtheta(self): - dL_dtheta = self.kern._param_grad_helper(self.dL_dKmm, self.Z) - if self.has_uncertain_inputs: - dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z, self.X_batch, self.X_variance_batch) - dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1, self.Z, self.X_batch, self.X_variance_batch) - dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2, self.Z, self.X_batch, self.X_variance_batch) - else: - dL_dtheta += self.kern._param_grad_helper(self.dL_dpsi1, self.X_batch, self.Z) - dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X_batch) - return dL_dtheta - - def _set_params(self, p, computations=True): - self.kern._set_params_transformed(p[:self.kern.num_params]) - self.likelihood._set_params(p[self.kern.num_params:]) - if computations: - self._compute_kernel_matrices() - self._computations() - - def _get_params(self): - return np.hstack((self.kern._get_params_transformed() , self.likelihood._get_params())) - - def _get_param_names(self): - return self.kern._get_param_names_transformed() + self.likelihood._get_param_names() - - def load_batch(self): - """ - load a batch of data (set self.X_batch and self.Y_batch from self.X, self.Y) - """ - - #if we've seen all the data, start again with them in a new random order - if self.batchcounter+self.batchsize > self.num_data: - self.batchcounter = 0 - self.epochs += 1 - self._permutation = np.random.permutation(self.num_data) - - this_perm = self._permutation[self.batchcounter:self.batchcounter+self.batchsize] - - self.X_batch = self.X[this_perm] - self.Y_batch = 
self.Y[this_perm] - - self.batchcounter += self.batchsize - - self.data_prop = float(self.batchsize)/self.num_data - - self._compute_kernel_matrices() - self._computations() - - def _computations(self,do_Kmm=True, do_Kmm_grad=True): - """ - All of the computations needed. Some are optional, see kwargs. - """ - - if do_Kmm: - self.Lm = jitchol(self.Kmm) - - # The rather complex computations of self.A - if self.has_uncertain_inputs: - if self.likelihood.is_heteroscedastic: - psi2_beta = (self.psi2 * (self.likelihood.precision.flatten().reshape(self.batchsize, 1, 1))).sum(0) - else: - psi2_beta = self.psi2.sum(0) * self.likelihood.precision - evals, evecs = np.linalg.eigh(psi2_beta) - clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable - tmp = evecs * np.sqrt(clipped_evals) - else: - if self.likelihood.is_heteroscedastic: - tmp = self.psi1.T * (np.sqrt(self.likelihood.precision.flatten().reshape(1, self.batchsize))) - else: - tmp = self.psi1.T * (np.sqrt(self.likelihood.precision)) - tmp, _ = dtrtrs(self.Lm, np.asfortranarray(tmp), lower=1) - self.A = tdot(tmp) - - self.V = self.likelihood.precision*self.likelihood.Y - self.VmT = np.dot(self.V,self.q_u_expectation[0].T) - self.psi1V = np.dot(self.psi1.T, self.V) - - self.B = np.eye(self.num_inducing)*self.data_prop + self.A - self.Lambda = backsub_both_sides(self.Lm, self.B.T) - self.LQL = backsub_both_sides(self.Lm,self.q_u_expectation[1].T,transpose='right') - - self.trace_K = self.psi0.sum() - np.trace(self.A)/self.likelihood.precision - self.Kmmi_m, _ = dpotrs(self.Lm, self.q_u_expectation[0], lower=1) - self.projected_mean = np.dot(self.psi1,self.Kmmi_m) - - # Compute dL_dpsi - self.dL_dpsi0 = - 0.5 * self.output_dim * self.likelihood.precision * np.ones(self.batchsize) - self.dL_dpsi1, _ = dpotrs(self.Lm,np.asfortranarray(self.VmT.T),lower=1) - self.dL_dpsi1 = self.dL_dpsi1.T - - dL_dpsi2 = -0.5 * self.likelihood.precision * backsub_both_sides(self.Lm, self.LQL - self.output_dim * np.eye(self.num_inducing)) - if self.has_uncertain_inputs: - self.dL_dpsi2 = np.repeat(dL_dpsi2[None,:,:],self.batchsize,axis=0) - else: - self.dL_dpsi1 += 2.*np.dot(dL_dpsi2,self.psi1.T).T - self.dL_dpsi2 = None - - # Compute dL_dKmm - if do_Kmm_grad: - tmp = np.dot(self.LQL,self.A) - backsub_both_sides(self.Lm,np.dot(self.q_u_expectation[0],self.psi1V.T),transpose='right') - tmp += tmp.T - tmp += -self.output_dim*self.B - tmp += self.data_prop*self.LQL - self.dL_dKmm = 0.5*backsub_both_sides(self.Lm,tmp) - - #Compute the gradient of the log likelihood wrt noise variance - self.partial_for_likelihood = -0.5*(self.batchsize*self.output_dim - np.sum(self.A*self.LQL))*self.likelihood.precision - self.partial_for_likelihood += (0.5*self.output_dim*self.trace_K + 0.5 * self.likelihood.trYYT - np.sum(self.likelihood.Y*self.projected_mean))*self.likelihood.precision**2 - - - def log_likelihood(self): - """ - As for uncollapsed sparse GP, but account for the proportion of data we're looking at right now. - - NB. 
self.batchsize is the size of the batch, not the size of X_all - """ - assert not self.likelihood.is_heteroscedastic - A = -0.5*self.batchsize*self.output_dim*(np.log(2.*np.pi) - np.log(self.likelihood.precision)) - B = -0.5*self.likelihood.precision*self.output_dim*self.trace_K - Kmm_logdet = 2.*np.sum(np.log(np.diag(self.Lm))) - C = -0.5*self.output_dim*self.data_prop*(Kmm_logdet-self.q_u_logdet - self.num_inducing) - C += -0.5*np.sum(self.LQL * self.B) - D = -0.5*self.likelihood.precision*self.likelihood.trYYT - E = np.sum(self.V*self.projected_mean) - return (A+B+C+D+E)/self.data_prop - - def _log_likelihood_gradients(self): - return np.hstack((self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood)))/self.data_prop - - def vb_grad_natgrad(self): - """ - Compute the gradients of the lower bound wrt the canonical and - Expectation parameters of u. - - Note that the natural gradient in either is given by the gradient in - the other (See Hensman et al 2012 Fast Variational inference in the - conjugate exponential Family) - """ - - # Gradient for eta - dL_dmmT_S = -0.5*self.Lambda/self.data_prop + 0.5*self.q_u_prec - Kmmipsi1V,_ = dpotrs(self.Lm,self.psi1V,lower=1) - dL_dm = (Kmmipsi1V - np.dot(self.Lambda,self.q_u_mean))/self.data_prop - - # Gradients for theta - S = self.q_u_cov - Si = self.q_u_prec - m = self.q_u_mean - dL_dSi = -mdot(S,dL_dmmT_S, S) - - dL_dmhSi = -2*dL_dSi - dL_dSim = np.dot(dL_dSi,m) + np.dot(Si, dL_dm) - - return np.hstack((dL_dm.flatten(),dL_dmmT_S.flatten())) , np.hstack((dL_dSim.flatten(), dL_dmhSi.flatten())) - - - def optimize(self, iterations, print_interval=10, callback=lambda:None, callback_interval=5): - - param_step = 0. - - #Iterate! - for i in range(iterations): - - #store the current configuration for plotting later - self._param_trace.append(self._get_params()) - self._ll_trace.append(self.log_likelihood() + self.log_prior()) - - #load a batch - self.load_batch() - - #compute the (stochastic) gradient - natgrads = self.vb_grad_natgrad() - grads = self._transform_gradients(self._log_likelihood_gradients() + self._log_prior_gradients()) - self._grad_trace.append(grads) - - #compute the steps in all parameters - vb_step = self.vb_steplength*natgrads[0] - if (self.epochs>=1):#only move the parameters after the first epoch - param_step = self.momentum*param_step + self.param_steplength*grads - else: - param_step = 0. 
- - self.set_vb_param(self.get_vb_param() + vb_step) - #Note: don't recompute everything here, wait until the next iteration when we have a new batch - self._set_params(self._untransform_params(self._get_params_transformed() + param_step), computations=False) - - #print messages if desired - if i and (not i%print_interval): - print i, np.mean(self._ll_trace[-print_interval:]) #, self.log_likelihood() - print np.round(np.mean(self._grad_trace[-print_interval:],0),3) - sys.stdout.flush() - - #callback - if i and not i%callback_interval: - callback(self) # Change this to callback() - time.sleep(0.01) - - if self.epochs > 10: - self._adapt_steplength() - - self.iterations += 1 - - - def _adapt_steplength(self): - if self.adapt_vb_steplength: - # self._adaptive_vb_steplength() - self._adaptive_vb_steplength_KL() - self._vb_steplength_trace.append(self.vb_steplength) - assert self.vb_steplength > 0 - - if self.adapt_param_steplength: - self._adaptive_param_steplength() - # self._adaptive_param_steplength_log() - # self._adaptive_param_steplength_from_vb() - self._param_steplength_trace.append(self.param_steplength) - - def _adaptive_param_steplength(self): - decr_factor = 0.02 - g_tp = self._transform_gradients(self._log_likelihood_gradients()) - self.gbar_tp = (1-1/self.tau_tp)*self.gbar_tp + 1/self.tau_tp * g_tp - self.hbar_tp = (1-1/self.tau_tp)*self.hbar_tp + 1/self.tau_tp * np.dot(g_tp.T, g_tp) - new_param_steplength = np.dot(self.gbar_tp.T, self.gbar_tp) / self.hbar_tp - #- hack - new_param_steplength *= decr_factor - self.param_steplength = (self.param_steplength + new_param_steplength)/2 - #- - self.tau_tp = self.tau_tp*(1-self.param_steplength) + 1 - - def _adaptive_param_steplength_log(self): - stp = np.logspace(np.log(0.0001), np.log(1e-6), base=np.e, num=18000) - self.param_steplength = stp[self.iterations] - - def _adaptive_param_steplength_log2(self): - self.param_steplength = (self.iterations + 0.001)**-0.5 - - def _adaptive_param_steplength_from_vb(self): - self.param_steplength = self.vb_steplength * 0.01 - - def _adaptive_vb_steplength(self): - decr_factor = 0.1 - g_t = self.vb_grad_natgrad()[0] - self.gbar_t = (1-1/self.tau_t)*self.gbar_t + 1/self.tau_t * g_t - self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t.T, g_t) - new_vb_steplength = np.dot(self.gbar_t.T, self.gbar_t) / self.hbar_t - #- hack - new_vb_steplength *= decr_factor - self.vb_steplength = (self.vb_steplength + new_vb_steplength)/2 - #- - self.tau_t = self.tau_t*(1-self.vb_steplength) + 1 - - def _adaptive_vb_steplength_KL(self): - decr_factor = 0.1 - natgrad = self.vb_grad_natgrad() - g_t1 = natgrad[0] - g_t2 = natgrad[1] - self.gbar_t1 = (1-1/self.tau_t)*self.gbar_t1 + 1/self.tau_t * g_t1 - self.gbar_t2 = (1-1/self.tau_t)*self.gbar_t2 + 1/self.tau_t * g_t2 - self.hbar_t = (1-1/self.tau_t)*self.hbar_t + 1/self.tau_t * np.dot(g_t1.T, g_t2) - self.vb_steplength = np.dot(self.gbar_t1.T, self.gbar_t2) / self.hbar_t - self.vb_steplength *= decr_factor - self.tau_t = self.tau_t*(1-self.vb_steplength) + 1 - - def _raw_predict(self, X_new, X_variance_new=None, which_parts='all',full_cov=False): - """Internal helper function for making predictions, does not account for normalization""" - - #TODO: make this more efficient! 
- self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm) - tmp = self.Kmmi- mdot(self.Kmmi,self.q_u_cov,self.Kmmi) - - if X_variance_new is None: - Kx = self.kern.K(X_new,self.Z) - mu = np.dot(Kx,self.Kmmi_m) - if full_cov: - Kxx = self.kern.K(X_new) - var = Kxx - mdot(Kx,tmp,Kx.T) - else: - Kxx = self.kern.Kdiag(X_new) - var = (Kxx - np.sum(Kx*np.dot(Kx,tmp),1))[:,None] - return mu, var - else: - assert X_variance_new.shape == X_new.shape - Kx = self.kern.psi1(self.Z,X_new, X_variance_new) - mu = np.dot(Kx,self.Kmmi_m) - Kxx = self.kern.psi0(self.Z,X_new,X_variance_new) - psi2 = self.kern.psi2(self.Z,X_new,X_variance_new) - diag_var = Kxx - np.sum(np.sum(psi2*tmp[None,:,:],1),1) - if full_cov: - raise NotImplementedError - else: - return mu, diag_var[:,None] - - def predict(self, Xnew, X_variance_new=None, which_parts='all', full_cov=False): - # normalize X values - Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale - if X_variance_new is not None: - X_variance_new = X_variance_new / self._Xscale ** 2 - - # here's the actual prediction by the GP model - mu, var = self._raw_predict(Xnew, X_variance_new, full_cov=full_cov, which_parts=which_parts) - - # now push through likelihood - mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov) - - return mean, var, _025pm, _975pm - - - def set_vb_param(self,vb_param): - """set the distribution q(u) from the canonical parameters""" - self.q_u_canonical_flat = vb_param.copy() - self.q_u_canonical = self.q_u_canonical_flat[:self.num_inducing*self.output_dim].reshape(self.num_inducing,self.output_dim),self.q_u_canonical_flat[self.num_inducing*self.output_dim:].reshape(self.num_inducing,self.num_inducing) - - self.q_u_prec = -2.*self.q_u_canonical[1] - self.q_u_cov, q_u_Li, q_u_L, tmp = pdinv(self.q_u_prec) - self.q_u_Li = q_u_Li - self.q_u_logdet = -tmp - self.q_u_mean, _ = dpotrs(q_u_Li, np.asfortranarray(self.q_u_canonical[0]),lower=1) - - self.q_u_expectation = (self.q_u_mean, np.dot(self.q_u_mean,self.q_u_mean.T)+self.q_u_cov*self.output_dim) - - - def get_vb_param(self): - """ - Return the canonical parameters of the distribution q(u) - """ - return self.q_u_canonical_flat - - - def plot(self, *args, **kwargs): - """ - See GPy.plotting.matplot_dep.svig_plots.plot - """ - assert "matplotlib" in sys.modules, "matplotlib package has not been imported." - from ..plotting.matplot_dep import svig_plots - svig_plots.plot(self,*args,**kwargs) - - - def plot_traces(self): - """ - See GPy.plotting.matplot_dep.svig_plots.plot_traces - """ - assert "matplotlib" in sys.modules, "matplotlib package has not been imported." - from ..plotting.matplot_dep import svig_plots - svig_plots.plot_traces(self) diff --git a/GPy/examples/__init__.py b/GPy/examples/__init__.py index c575bb33..93994175 100644 --- a/GPy/examples/__init__.py +++ b/GPy/examples/__init__.py @@ -4,6 +4,4 @@ import classification import regression import dimensionality_reduction -import tutorials -import stochastic import non_gaussian diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py index cf71be24..b9d488d6 100644 --- a/GPy/examples/classification.py +++ b/GPy/examples/classification.py @@ -28,13 +28,13 @@ def oil(num_inducing=50, max_iters=100, kernel=None, optimize=True, plot=True): m = GPy.models.SparseGPClassification(X, Y, kernel=kernel, num_inducing=num_inducing) # Contrain all parameters to be positive - m.tie_params('.*len') + #m.tie_params('.*len') m['.*len'] = 10. 
- m.update_likelihood_approximation() # Optimize if optimize: - m.optimize(max_iters=max_iters) + for _ in range(5): + m.optimize(max_iters=int(max_iters/5)) print(m) #Test @@ -150,7 +150,7 @@ def sparse_toy_linear_1d_classification(num_inducing=10, seed=default_seed, opti print m return m -def toy_heaviside(seed=default_seed, optimize=True, plot=True): +def toy_heaviside(seed=default_seed, max_iters=100, optimize=True, plot=True): """ Simple 1D classification example using a heavy side gp transformation @@ -166,16 +166,18 @@ def toy_heaviside(seed=default_seed, optimize=True, plot=True): Y[Y.flatten() == -1] = 0 # Model definition - noise_model = GPy.likelihoods.bernoulli(GPy.likelihoods.noise_models.gp_transformations.Heaviside()) - likelihood = GPy.likelihoods.EP(Y, noise_model) - m = GPy.models.GPClassification(data['X'], likelihood=likelihood) + kernel = GPy.kern.RBF(1) + likelihood = GPy.likelihoods.Bernoulli(gp_link=GPy.likelihoods.link_functions.Heaviside()) + ep = GPy.inference.latent_function_inference.expectation_propagation.EP() + m = GPy.core.GP(X=data['X'], Y=Y, kernel=kernel, likelihood=likelihood, inference_method=ep, name='gp_classification_heaviside') + #m = GPy.models.GPClassification(data['X'], likelihood=likelihood) # Optimize if optimize: - m.update_likelihood_approximation() # Parameters optimization: - m.optimize() - #m.pseudo_EM() + for _ in range(5): + m.optimize(max_iters=int(max_iters/5)) + print m # Plot if plot: diff --git a/GPy/examples/non_gaussian.py b/GPy/examples/non_gaussian.py index 1e2be93b..57c841e4 100644 --- a/GPy/examples/non_gaussian.py +++ b/GPy/examples/non_gaussian.py @@ -59,7 +59,7 @@ def student_t_approx(optimize=True, plot=True): t_distribution = GPy.likelihoods.StudentT(deg_free=deg_free, sigma2=edited_real_sd) laplace_inf = GPy.inference.latent_function_inference.Laplace() m3 = GPy.core.GP(X, Y.copy(), kernel3, likelihood=t_distribution, inference_method=laplace_inf) - m3['.*t_noise'].constrain_bounded(1e-6, 10.) + m3['.*t_scale2'].constrain_bounded(1e-6, 10.) m3['.*white'].constrain_fixed(1e-5) m3.randomize() @@ -67,7 +67,7 @@ def student_t_approx(optimize=True, plot=True): t_distribution = GPy.likelihoods.StudentT(deg_free=deg_free, sigma2=edited_real_sd) laplace_inf = GPy.inference.latent_function_inference.Laplace() m4 = GPy.core.GP(X, Yc.copy(), kernel4, likelihood=t_distribution, inference_method=laplace_inf) - m4['.*t_noise'].constrain_bounded(1e-6, 10.) + m4['.*t_scale2'].constrain_bounded(1e-6, 10.) 
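# A short sketch of the regex-based parameter handling used in the examples above, assuming
# the renamed Student-t parameter '.*t_scale2' and the Laplace inference object shown in this
# file; the data below is synthetic and only for illustration.
import numpy as np
import GPy

X = np.linspace(0., 10., 50)[:, None]
Y = np.sin(X) + np.random.standard_t(3, X.shape) * 0.1
t_distribution = GPy.likelihoods.StudentT(deg_free=3, sigma2=0.1)
laplace_inf = GPy.inference.latent_function_inference.Laplace()
m = GPy.core.GP(X, Y, GPy.kern.RBF(1), likelihood=t_distribution, inference_method=laplace_inf)
m['.*t_scale2'].constrain_bounded(1e-6, 10.)   # constrain every parameter matched by the regex
m.optimize()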
m4['.*white'].constrain_fixed(1e-5) m4.randomize() print m4 @@ -124,6 +124,7 @@ def student_t_approx(optimize=True, plot=True): return m1, m2, m3, m4 def boston_example(optimize=True, plot=True): + raise NotImplementedError("Needs updating") import sklearn from sklearn.cross_validation import KFold optimizer='bfgs' @@ -152,8 +153,8 @@ def boston_example(optimize=True, plot=True): noise = 1e-1 #np.exp(-2) rbf_len = 0.5 data_axis_plot = 4 - kernelstu = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1]) - kernelgp = GPy.kern.rbf(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1]) + kernelstu = GPy.kern.RBF(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1]) + kernelgp = GPy.kern.RBF(X.shape[1]) + GPy.kern.white(X.shape[1]) + GPy.kern.bias(X.shape[1]) #Baseline score_folds[0, n] = rmse(Y_test, np.mean(Y_train)) @@ -162,8 +163,8 @@ def boston_example(optimize=True, plot=True): print "Gauss GP" mgp = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelgp.copy()) mgp.constrain_fixed('.*white', 1e-5) - mgp['rbf_len'] = rbf_len - mgp['noise'] = noise + mgp['.*len'] = rbf_len + mgp['.*noise'] = noise print mgp if optimize: mgp.optimize(optimizer=optimizer, messages=messages) @@ -198,9 +199,9 @@ def boston_example(optimize=True, plot=True): stu_t_likelihood = GPy.likelihoods.Laplace(Y_train.copy(), t_distribution) mstu_t = GPy.models.GPRegression(X_train.copy(), Y_train.copy(), kernel=kernelstu.copy(), likelihood=stu_t_likelihood) mstu_t.constrain_fixed('.*white', 1e-5) - mstu_t.constrain_bounded('.*t_noise', 0.0001, 1000) + mstu_t.constrain_bounded('.*t_scale2', 0.0001, 1000) mstu_t['rbf_len'] = rbf_len - mstu_t['.*t_noise'] = noise + mstu_t['.*t_scale2'] = noise print mstu_t if optimize: mstu_t.optimize(optimizer=optimizer, messages=messages) diff --git a/GPy/examples/stochastic.py b/GPy/examples/stochastic.py deleted file mode 100644 index cc365cae..00000000 --- a/GPy/examples/stochastic.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). -# Licensed under the BSD 3-clause license (see LICENSE.txt) - -try: - import pylab as pb -except: - pass -import numpy as np -import GPy - -def toy_1d(optimize=True, plot=True): - N = 2000 - M = 20 - - #create data - X = np.linspace(0,32,N)[:,None] - Z = np.linspace(0,32,M)[:,None] - Y = np.sin(X) + np.cos(0.3*X) + np.random.randn(*X.shape)/np.sqrt(50.) - - m = GPy.models.SVIGPRegression(X,Y, batchsize=10, Z=Z) - m.constrain_bounded('noise_variance',1e-3,1e-1) - m.constrain_bounded('white_variance',1e-3,1e-1) - - m.param_steplength = 1e-4 - - if plot: - fig = pb.figure() - ax = fig.add_subplot(111) - def cb(foo): - ax.cla() - m.plot(ax=ax,Z_height=-3) - ax.set_ylim(-3,3) - fig.canvas.draw() - - if optimize: - m.optimize(500, callback=cb, callback_interval=1) - - if plot: - m.plot_traces() - return m diff --git a/GPy/examples/tutorials.py b/GPy/examples/tutorials.py deleted file mode 100644 index aa82d9f9..00000000 --- a/GPy/examples/tutorials.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
-# Licensed under the BSD 3-clause license (see LICENSE.txt) - - -""" -Code of Tutorials -""" - -try: - import pylab as pb - pb.ion() -except: - pass -import numpy as np -import GPy - -def tuto_GP_regression(optimize=True, plot=True): - """The detailed explanations of the commands used in this file can be found in the tutorial section""" - - X = np.random.uniform(-3.,3.,(20,1)) - Y = np.sin(X) + np.random.randn(20,1)*0.05 - - kernel = GPy.kern.rbf(input_dim=1, variance=1., lengthscale=1.) - - m = GPy.models.GPRegression(X, Y, kernel) - - print m - if plot: - m.plot() - - m.constrain_positive('') - - m.unconstrain('') # may be used to remove the previous constrains - m.constrain_positive('.*rbf_variance') - m.constrain_bounded('.*lengthscale',1.,10. ) - m.constrain_fixed('.*noise',0.0025) - - if optimize: - m.optimize() - m.optimize_restarts(num_restarts = 10) - - ####################################################### - ####################################################### - # sample inputs and outputs - X = np.random.uniform(-3.,3.,(50,2)) - Y = np.sin(X[:,0:1]) * np.sin(X[:,1:2])+np.random.randn(50,1)*0.05 - - # define kernel - ker = GPy.kern.Matern52(2,ARD=True) + GPy.kern.white(2) - - # create simple GP model - m = GPy.models.GPRegression(X, Y, ker) - - # contrain all parameters to be positive - m.constrain_positive('') - - # optimize and plot - if optimize: - m.optimize('tnc', max_f_eval = 1000) - if plot: - m.plot() - - print m - return(m) - -def tuto_kernel_overview(optimize=True, plot=True): - """The detailed explanations of the commands used in this file can be found in the tutorial section""" - ker1 = GPy.kern.rbf(1) # Equivalent to ker1 = GPy.kern.rbf(input_dim=1, variance=1., lengthscale=1.) - ker2 = GPy.kern.rbf(input_dim=1, variance = .75, lengthscale=2.) - ker3 = GPy.kern.rbf(1, .5, .5) - - print ker2 - - if plot: - ker1.plot() - ker2.plot() - ker3.plot() - - k1 = GPy.kern.rbf(1,1.,2.) - k2 = GPy.kern.Matern32(1, 0.5, 0.2) - - # Product of kernels - k_prod = k1.prod(k2) # By default, tensor=False - k_prodtens = k1.prod(k2,tensor=True) - - # Sum of kernels - k_add = k1.add(k2) # By default, tensor=False - k_addtens = k1.add(k2,tensor=True) - - k1 = GPy.kern.rbf(1,1.,2) - k2 = GPy.kern.periodic_Matern52(1,variance=1e3, lengthscale=1, period = 1.5, lower=-5., upper = 5) - - k = k1 * k2 # equivalent to k = k1.prod(k2) - print k - - # Simulate sample paths - X = np.linspace(-5,5,501)[:,None] - Y = np.random.multivariate_normal(np.zeros(501),k.K(X),1) - - k1 = GPy.kern.rbf(1) - k2 = GPy.kern.Matern32(1) - k3 = GPy.kern.white(1) - - k = k1 + k2 + k3 - print k - - k.constrain_positive('.*var') - k.constrain_fixed(np.array([1]),1.75) - k.tie_params('.*len') - k.unconstrain('white') - k.constrain_bounded('white',lower=1e-5,upper=.5) - print k - - k_cst = GPy.kern.bias(1,variance=1.) 
- k_mat = GPy.kern.Matern52(1,variance=1., lengthscale=3) - Kanova = (k_cst + k_mat).prod(k_cst + k_mat,tensor=True) - print Kanova - - # sample inputs and outputs - X = np.random.uniform(-3.,3.,(40,2)) - Y = 0.5*X[:,:1] + 0.5*X[:,1:] + 2*np.sin(X[:,:1]) * np.sin(X[:,1:]) - - # Create GP regression model - m = GPy.models.GPRegression(X, Y, Kanova) - - if plot: - fig = pb.figure(figsize=(5,5)) - ax = fig.add_subplot(111) - m.plot(ax=ax) - - pb.figure(figsize=(20,3)) - pb.subplots_adjust(wspace=0.5) - axs = pb.subplot(1,5,1) - m.plot(ax=axs) - pb.subplot(1,5,2) - pb.ylabel("= ",rotation='horizontal',fontsize='30') - axs = pb.subplot(1,5,3) - m.plot(ax=axs, which_parts=[False,True,False,False]) - pb.ylabel("cst +",rotation='horizontal',fontsize='30') - axs = pb.subplot(1,5,4) - m.plot(ax=axs, which_parts=[False,False,True,False]) - pb.ylabel("+ ",rotation='horizontal',fontsize='30') - axs = pb.subplot(1,5,5) - pb.ylabel("+ ",rotation='horizontal',fontsize='30') - m.plot(ax=axs, which_parts=[False,False,False,True]) - - return(m) - - -def model_interaction(optimize=True, plot=True): - X = np.random.randn(20,1) - Y = np.sin(X) + np.random.randn(*X.shape)*0.01 + 5. - k = GPy.kern.rbf(1) + GPy.kern.bias(1) - m = GPy.models.GPRegression(X, Y, kernel=k) - return m - diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py index 6302a590..f6a24c86 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py @@ -7,193 +7,392 @@ The package for the psi statistics computation import numpy as np -def psicomputations(variance, lengthscale, Z, variational_posterior): - """ - Z - MxQ - mu - NxQ - S - NxQ - gamma - NxQ - """ - # here are the "statistics" for psi0, psi1 and psi2 - # Produced intermediate results: - # _psi1 NxM - mu = variational_posterior.mean - S = variational_posterior.variance - gamma = variational_posterior.binary_prob +try: + from scipy import weave + + def _psicomputations(variance, lengthscale, Z, variational_posterior): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi0, psi1 and psi2 + # Produced intermediate results: + # _psi1 NxM + mu = variational_posterior.mean + S = variational_posterior.variance + gamma = variational_posterior.binary_prob + + N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1] + l2 = np.square(lengthscale) + log_denom1 = np.log(S/l2+1) + log_denom2 = np.log(2*S/l2+1) + log_gamma = np.log(gamma) + log_gamma1 = np.log(1.-gamma) + variance = float(variance) + psi0 = np.empty(N) + psi0[:] = variance + psi1 = np.empty((N,M)) + psi2n = np.empty((N,M,M)) + + from ....util.misc import param_to_array + S = param_to_array(S) + mu = param_to_array(mu) + gamma = param_to_array(gamma) + Z = param_to_array(Z) + + support_code = """ + #include + """ + code = """ + for(int n=0; npsi1_exp2)?psi1_exp1+log1p(exp(psi1_exp2-psi1_exp1)):psi1_exp2+log1p(exp(psi1_exp1-psi1_exp2)); + } + // Compute Psi_2 + double muZhat = mu(n,q) - (Zm1q+Zm2q)/2.; + double Z2 = Zm1q*Zm1q+ Zm2q*Zm2q; + double dZ = Zm1q - Zm2q; + + double psi2_exp1 = dZ*dZ/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_denom2(n,q)/2. 
+ log_gamma(n,q); + double psi2_exp2 = log_gamma1(n,q) - Z2/(2.*lq); + log_psi2_n += (psi2_exp1>psi2_exp2)?psi2_exp1+log1p(exp(psi2_exp2-psi2_exp1)):psi2_exp2+log1p(exp(psi2_exp1-psi2_exp2)); + } + double exp_psi2_n = exp(log_psi2_n); + psi2n(n,m1,m2) = variance*variance*exp_psi2_n; + if(m1!=m2) { psi2n(n,m2,m1) = variance*variance*exp_psi2_n;} + } + psi1(n,m1) = variance*exp(log_psi1); + } + } + """ + weave.inline(code, support_code=support_code, arg_names=['psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','log_denom1','log_denom2','log_gamma','log_gamma1'], type_converters=weave.converters.blitz) + + psi2 = psi2n.sum(axis=0) + return psi0,psi1,psi2,psi2n + + from GPy.util.caching import Cacher + psicomputations = Cacher(_psicomputations, limit=1) + + def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): + ARD = (len(lengthscale)!=1) + + _,psi1,_,psi2n = psicomputations(variance, lengthscale, Z, variational_posterior) + + mu = variational_posterior.mean + S = variational_posterior.variance + gamma = variational_posterior.binary_prob + N,M,Q = mu.shape[0],Z.shape[0],mu.shape[1] + l2 = np.square(lengthscale) + log_denom1 = np.log(S/l2+1) + log_denom2 = np.log(2*S/l2+1) + log_gamma = np.log(gamma) + log_gamma1 = np.log(1.-gamma) + variance = float(variance) + + dvar = np.zeros(1) + dmu = np.zeros((N,Q)) + dS = np.zeros((N,Q)) + dgamma = np.zeros((N,Q)) + dl = np.zeros(Q) + dZ = np.zeros((M,Q)) + dvar += np.sum(dL_dpsi0) + + from ....util.misc import param_to_array + S = param_to_array(S) + mu = param_to_array(mu) + gamma = param_to_array(gamma) + Z = param_to_array(Z) + + support_code = """ + #include + """ + code = """ + for(int n=0; nexp2) { + d_exp1 = 1.; + d_exp2 = exp(exp2-exp1); + } else { + d_exp1 = exp(exp1-exp2); + d_exp2 = 1.; + } + double exp_sum = d_exp1+d_exp2; + + dmu(n,q) += lpsi1*Zmu*d_exp1/(denom*exp_sum); + dS(n,q) += lpsi1*(Zmu2_denom-1.)*d_exp1/(denom*exp_sum)/2.; + dgamma(n,q) += lpsi1*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum; + dl(q) += lpsi1*((Zmu2_denom+Snq/lq)/denom*d_exp1+Zm1q*Zm1q/(lq*lq)*d_exp2)/(2.*exp_sum); + dZ(m1,q) += lpsi1*(-Zmu/denom*d_exp1-Zm1q/lq*d_exp2)/exp_sum; + } + // Compute Psi_2 + double lpsi2 = psi2n(n,m1,m2)*dL_dpsi2(m1,m2); + if(q==0) {dvar(0) += lpsi2*2/variance;} + + double dZm1m2 = Zm1q - Zm2q; + double Z2 = Zm1q*Zm1q+Zm2q*Zm2q; + double muZhat = mu_nq - (Zm1q + Zm2q)/2.; + double denom = 2.*Snq+lq; + double muZhat2_denom = muZhat*muZhat/denom; + + double exp1 = dZm1m2*dZm1m2/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_denom2(n,q)/2. 
+ log_gamma(n,q); + double exp2 = log_gamma1(n,q) - Z2/(2.*lq); + double d_exp1,d_exp2; + if(exp1>exp2) { + d_exp1 = 1.; + d_exp2 = exp(exp2-exp1); + } else { + d_exp1 = exp(exp1-exp2); + d_exp2 = 1.; + } + double exp_sum = d_exp1+d_exp2; + + dmu(n,q) += -2.*lpsi2*muZhat/denom*d_exp1/exp_sum; + dS(n,q) += lpsi2*(2.*muZhat2_denom-1.)/denom*d_exp1/exp_sum; + dgamma(n,q) += lpsi2*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum; + dl(q) += lpsi2*(((Snq/lq+muZhat2_denom)/denom+dZm1m2*dZm1m2/(4.*lq*lq))*d_exp1+Z2/(2.*lq*lq)*d_exp2)/exp_sum; + dZ(m1,q) += 2.*lpsi2*((muZhat/denom-dZm1m2/(2*lq))*d_exp1-Zm1q/lq*d_exp2)/exp_sum; + } + } + } + } + """ + weave.inline(code, support_code=support_code, arg_names=['dL_dpsi1','dL_dpsi2','psi1','psi2n','N','M','Q','variance','l2','Z','mu','S','gamma','log_denom1','log_denom2','log_gamma','log_gamma1','dvar','dl','dmu','dS','dgamma','dZ'], type_converters=weave.converters.blitz) + + dl *= 2.*lengthscale + if not ARD: + dl = dl.sum() + + return dvar, dl, dZ, dmu, dS, dgamma + +except: + + def psicomputations(variance, lengthscale, Z, variational_posterior): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi0, psi1 and psi2 + # Produced intermediate results: + # _psi1 NxM + mu = variational_posterior.mean + S = variational_posterior.variance + gamma = variational_posterior.binary_prob + + psi0 = np.empty(mu.shape[0]) + psi0[:] = variance + psi1 = _psi1computations(variance, lengthscale, Z, mu, S, gamma) + psi2 = _psi2computations(variance, lengthscale, Z, mu, S, gamma) + return psi0, psi1, psi2 - psi0 = np.empty(mu.shape[0]) - psi0[:] = variance - psi1 = _psi1computations(variance, lengthscale, Z, mu, S, gamma) - psi2 = _psi2computations(variance, lengthscale, Z, mu, S, gamma) - return psi0, psi1, psi2 - -def _psi1computations(variance, lengthscale, Z, mu, S, gamma): - """ - Z - MxQ - mu - NxQ - S - NxQ - gamma - NxQ - """ - # here are the "statistics" for psi1 - # Produced intermediate results: - # _psi1 NxM - - lengthscale2 = np.square(lengthscale) - - # psi1 - _psi1_denom = S[:, None, :] / lengthscale2 + 1. # Nx1xQ - _psi1_denom_sqrt = np.sqrt(_psi1_denom) #Nx1xQ - _psi1_dist = Z[None, :, :] - mu[:, None, :] # NxMxQ - _psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom) # NxMxQ - _psi1_common = gamma[:,None,:] / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #Nx1xQ - _psi1_exponent1 = np.log(gamma[:,None,:]) - (_psi1_dist_sq + np.log(_psi1_denom))/2. # NxMxQ - _psi1_exponent2 = np.log(1.-gamma[:,None,:]) - (np.square(Z[None,:,:])/lengthscale2)/2. 
# NxMxQ - _psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2) - _psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ - _psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM - _psi1 = variance * np.exp(_psi1_exp_sum) # NxM - - return _psi1 - -def _psi2computations(variance, lengthscale, Z, mu, S, gamma): - """ - Z - MxQ - mu - NxQ - S - NxQ - gamma - NxQ - """ - # here are the "statistics" for psi2 - # Produced intermediate results: - # _psi2 MxM + def _psi1computations(variance, lengthscale, Z, mu, S, gamma): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi1 + # Produced intermediate results: + # _psi1 NxM - lengthscale2 = np.square(lengthscale) + lengthscale2 = np.square(lengthscale) - _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q - _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q - _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q - _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ - - # psi2 - _psi2_denom = 2.*S[:, None, None, :] / lengthscale2 + 1. # Nx1x1xQ - _psi2_denom_sqrt = np.sqrt(_psi2_denom) - _psi2_mudist = mu[:,None,None,:]-_psi2_Zhat #N,M,M,Q - _psi2_mudist_sq = np.square(_psi2_mudist)/(lengthscale2*_psi2_denom) - _psi2_common = gamma[:,None,None,:]/(lengthscale2 * _psi2_denom * _psi2_denom_sqrt) # Nx1x1xQ - _psi2_exponent1 = -_psi2_Zdist_sq -_psi2_mudist_sq -0.5*np.log(_psi2_denom)+np.log(gamma[:,None,None,:]) #N,M,M,Q - _psi2_exponent2 = np.log(1.-gamma[:,None,None,:]) - 0.5*(_psi2_Z_sq_sum) # NxMxMxQ - _psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2) - _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max)) - _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM - _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM - - return _psi2 - -def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): - ARD = (len(lengthscale)!=1) + # psi1 + _psi1_denom = S[:, None, :] / lengthscale2 + 1. # Nx1xQ + _psi1_denom_sqrt = np.sqrt(_psi1_denom) #Nx1xQ + _psi1_dist = Z[None, :, :] - mu[:, None, :] # NxMxQ + _psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom) # NxMxQ + _psi1_common = gamma[:,None,:] / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #Nx1xQ + _psi1_exponent1 = np.log(gamma[:,None,:]) - (_psi1_dist_sq + np.log(_psi1_denom))/2. # NxMxQ + _psi1_exponent2 = np.log(1.-gamma[:,None,:]) - (np.square(Z[None,:,:])/lengthscale2)/2. 
# NxMxQ
+    _psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2)
+    _psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ
+    _psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM
+    _psi1 = variance * np.exp(_psi1_exp_sum) # NxM
-    dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1, dgamma_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
-    dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2, dgamma_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
-
-    dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2
+    return _psi1
-    dL_dlengscale = dl_psi1 + dl_psi2
-    if not ARD:
-        dL_dlengscale = dL_dlengscale.sum()
-
-    dL_dgamma = dgamma_psi1 + dgamma_psi2
-    dL_dmu = dmu_psi1 + dmu_psi2
-    dL_dS = dS_psi1 + dS_psi2
-    dL_dZ = dZ_psi1 + dZ_psi2
+
+def _psi2computations(variance, lengthscale, Z, mu, S, gamma):
+    """
+    Z - MxQ
+    mu - NxQ
+    S - NxQ
+    gamma - NxQ
+    """
+    # here are the "statistics" for psi2
+    # Produced intermediate results:
+    # _psi2             MxM
+
+    lengthscale2 = np.square(lengthscale)
+
+    _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
+    _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
+    _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q
+    _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ
-    return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, dL_dgamma
-
-def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, gamma):
-    """
-    dL_dpsi1 - NxM
-    Z - MxQ
-    mu - NxQ
-    S - NxQ
-    gamma - NxQ
-    """
-    # here are the "statistics" for psi1
-    # Produced intermediate results: dL_dparams w.r.t. psi1
-    # _dL_dvariance      1
-    # _dL_dlengthscale   Q
-    # _dL_dZ             MxQ
-    # _dL_dgamma         NxQ
-    # _dL_dmu            NxQ
-    # _dL_dS             NxQ
+    # psi2
+    _psi2_denom = 2.*S[:, None, None, :] / lengthscale2 + 1. # Nx1x1xQ
+    _psi2_denom_sqrt = np.sqrt(_psi2_denom)
+    _psi2_mudist = mu[:,None,None,:]-_psi2_Zhat #N,M,M,Q
+    _psi2_mudist_sq = np.square(_psi2_mudist)/(lengthscale2*_psi2_denom)
+    _psi2_common = gamma[:,None,None,:]/(lengthscale2 * _psi2_denom * _psi2_denom_sqrt) # Nx1x1xQ
+    _psi2_exponent1 = -_psi2_Zdist_sq -_psi2_mudist_sq -0.5*np.log(_psi2_denom)+np.log(gamma[:,None,None,:]) #N,M,M,Q
+    _psi2_exponent2 = np.log(1.-gamma[:,None,None,:]) - 0.5*(_psi2_Z_sq_sum) # NxMxMxQ
+    _psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2)
+    _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max))
+    _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM
+    _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM
-    lengthscale2 = np.square(lengthscale)
-
-    # psi1
-    _psi1_denom = S / lengthscale2 + 1. # NxQ
-    _psi1_denom_sqrt = np.sqrt(_psi1_denom) #NxQ
-    _psi1_dist = Z[None, :, :] - mu[:, None, :] # NxMxQ
-    _psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom[:,None,:]) # NxMxQ
-    _psi1_common = gamma / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #NxQ
-    _psi1_exponent1 = np.log(gamma[:,None,:]) -0.5 * (_psi1_dist_sq + np.log(_psi1_denom[:, None,:])) # NxMxQ
-    _psi1_exponent2 = np.log(1.-gamma[:,None,:]) -0.5 * (np.square(Z[None,:,:])/lengthscale2) # NxMxQ
-    _psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2)
-    _psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ
-    _psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM
-    _psi1_exp_dist_sq = np.exp(-0.5*_psi1_dist_sq) # NxMxQ
-    _psi1_exp_Z = np.exp(-0.5*np.square(Z[None,:,:])/lengthscale2) # 1xMxQ
-    _psi1_q = variance * np.exp(_psi1_exp_sum[:,:,None] - _psi1_exponent) # NxMxQ
-    _psi1 = variance * np.exp(_psi1_exp_sum) # NxM
-    _dL_dvariance = np.einsum('nm,nm->',dL_dpsi1, _psi1)/variance # 1
-    _dL_dgamma = np.einsum('nm,nmq,nmq->nq',dL_dpsi1, _psi1_q, (_psi1_exp_dist_sq/_psi1_denom_sqrt[:,None,:]-_psi1_exp_Z)) # NxQ
-    _dL_dmu = np.einsum('nm, nmq, nmq, nmq, nq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_dist,_psi1_common) # NxQ
-    _dL_dS = np.einsum('nm,nmq,nmq,nq,nmq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_common,(_psi1_dist_sq-1.))/2. # NxQ
-    _dL_dZ = np.einsum('nm,nmq,nmq->mq',dL_dpsi1,_psi1_q, (- _psi1_common[:,None,:] * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z))
-    _dL_dlengthscale = lengthscale* np.einsum('nm,nmq,nmq->q',dL_dpsi1,_psi1_q,(_psi1_common[:,None,:]*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + (1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z))
-
-    return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma
-
-def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, gamma):
-    """
-    Z - MxQ
-    mu - NxQ
-    S - NxQ
-    gamma - NxQ
-    dL_dpsi2 - MxM
-    """
-    # here are the "statistics" for psi2
-    # Produced the derivatives w.r.t. psi2:
-    # _dL_dvariance      1
-    # _dL_dlengthscale   Q
-    # _dL_dZ             MxQ
-    # _dL_dgamma         NxQ
-    # _dL_dmu            NxQ
-    # _dL_dS             NxQ
+    return _psi2
-    lengthscale2 = np.square(lengthscale)
+def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
+    ARD = (len(lengthscale)!=1)
+
+    dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1, dgamma_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
+    dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2, dgamma_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
+
+    dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2
+
+    dL_dlengscale = dl_psi1 + dl_psi2
+    if not ARD:
+        dL_dlengscale = dL_dlengscale.sum()
+
+    dL_dgamma = dgamma_psi1 + dgamma_psi2
+    dL_dmu = dmu_psi1 + dmu_psi2
+    dL_dS = dS_psi1 + dS_psi2
+    dL_dZ = dZ_psi1 + dZ_psi2
+
+    return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, dL_dgamma
-    _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
-    _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
-    _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q
-    _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ
-
-    # psi2
-    _psi2_denom = 2.*S / lengthscale2 + 1. # NxQ
-    _psi2_denom_sqrt = np.sqrt(_psi2_denom)
-    _psi2_mudist = mu[:,None,None,:]-_psi2_Zhat #N,M,M,Q
-    _psi2_mudist_sq = np.square(_psi2_mudist)/(lengthscale2*_psi2_denom[:,None,None,:])
-    _psi2_common = gamma/(lengthscale2 * _psi2_denom * _psi2_denom_sqrt) # NxQ
-    _psi2_exponent1 = -_psi2_Zdist_sq -_psi2_mudist_sq -0.5*np.log(_psi2_denom[:,None,None,:])+np.log(gamma[:,None,None,:]) #N,M,M,Q
-    _psi2_exponent2 = np.log(1.-gamma[:,None,None,:]) - 0.5*(_psi2_Z_sq_sum) # NxMxMxQ
-    _psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2)
-    _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max))
-    _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM
-    _psi2_q = variance*variance * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ
-    _psi2_exp_dist_sq = np.exp(-_psi2_Zdist_sq -_psi2_mudist_sq) # NxMxMxQ
-    _psi2_exp_Z = np.exp(-0.5*_psi2_Z_sq_sum) # MxMxQ
-    _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM
-    _dL_dvariance = np.einsum('mo,mo->',dL_dpsi2,_psi2)*2./variance
-    _dL_dgamma = np.einsum('mo,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,(_psi2_exp_dist_sq/_psi2_denom_sqrt[:,None,None,:] - _psi2_exp_Z))
-    _dL_dmu = -2.*np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,_psi2_common,_psi2_mudist,_psi2_exp_dist_sq)
-    _dL_dS = np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q, _psi2_common, (2.*_psi2_mudist_sq-1.), _psi2_exp_dist_sq)
-    _dL_dZ = 2.*np.einsum('mo,nmoq,nmoq->mq',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(-_psi2_Zdist*_psi2_denom[:,None,None,:]+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z))
-    _dL_dlengthscale = 2.*lengthscale* np.einsum('mo,nmoq,nmoq->q',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z))
-
-    return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma
+def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, gamma):
+    """
+    dL_dpsi1 - NxM
+    Z - MxQ
+    mu - NxQ
+    S - NxQ
+    gamma - NxQ
+    """
+    # here are the "statistics" for psi1
+    # Produced intermediate results: dL_dparams w.r.t. psi1
+    # _dL_dvariance      1
+    # _dL_dlengthscale   Q
+    # _dL_dZ             MxQ
+    # _dL_dgamma         NxQ
+    # _dL_dmu            NxQ
+    # _dL_dS             NxQ
+
+    lengthscale2 = np.square(lengthscale)
+
+    # psi1
+    _psi1_denom = S / lengthscale2 + 1. # NxQ
+    _psi1_denom_sqrt = np.sqrt(_psi1_denom) #NxQ
+    _psi1_dist = Z[None, :, :] - mu[:, None, :] # NxMxQ
+    _psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom[:,None,:]) # NxMxQ
+    _psi1_common = gamma / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #NxQ
+    _psi1_exponent1 = np.log(gamma[:,None,:]) -0.5 * (_psi1_dist_sq + np.log(_psi1_denom[:, None,:])) # NxMxQ
+    _psi1_exponent2 = np.log(1.-gamma[:,None,:]) -0.5 * (np.square(Z[None,:,:])/lengthscale2) # NxMxQ
+    _psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2)
+    _psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ
+    _psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM
+    _psi1_exp_dist_sq = np.exp(-0.5*_psi1_dist_sq) # NxMxQ
+    _psi1_exp_Z = np.exp(-0.5*np.square(Z[None,:,:])/lengthscale2) # 1xMxQ
+    _psi1_q = variance * np.exp(_psi1_exp_sum[:,:,None] - _psi1_exponent) # NxMxQ
+    _psi1 = variance * np.exp(_psi1_exp_sum) # NxM
+    _dL_dvariance = np.einsum('nm,nm->',dL_dpsi1, _psi1)/variance # 1
+    _dL_dgamma = np.einsum('nm,nmq,nmq->nq',dL_dpsi1, _psi1_q, (_psi1_exp_dist_sq/_psi1_denom_sqrt[:,None,:]-_psi1_exp_Z)) # NxQ
+    _dL_dmu = np.einsum('nm, nmq, nmq, nmq, nq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_dist,_psi1_common) # NxQ
+    _dL_dS = np.einsum('nm,nmq,nmq,nq,nmq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_common,(_psi1_dist_sq-1.))/2. # NxQ
+    _dL_dZ = np.einsum('nm,nmq,nmq->mq',dL_dpsi1,_psi1_q, (- _psi1_common[:,None,:] * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z))
+    _dL_dlengthscale = lengthscale* np.einsum('nm,nmq,nmq->q',dL_dpsi1,_psi1_q,(_psi1_common[:,None,:]*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + (1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z))
+
+    return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma
+
+def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, gamma):
+    """
+    Z - MxQ
+    mu - NxQ
+    S - NxQ
+    gamma - NxQ
+    dL_dpsi2 - MxM
+    """
+    # here are the "statistics" for psi2
+    # Produced the derivatives w.r.t. psi2:
+    # _dL_dvariance      1
+    # _dL_dlengthscale   Q
+    # _dL_dZ             MxQ
+    # _dL_dgamma         NxQ
+    # _dL_dmu            NxQ
+    # _dL_dS             NxQ
+
+    lengthscale2 = np.square(lengthscale)
+
+    _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
+    _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
+    _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q
+    _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ
+
+    # psi2
+    _psi2_denom = 2.*S / lengthscale2 + 1. # NxQ
+    _psi2_denom_sqrt = np.sqrt(_psi2_denom)
+    _psi2_mudist = mu[:,None,None,:]-_psi2_Zhat #N,M,M,Q
+    _psi2_mudist_sq = np.square(_psi2_mudist)/(lengthscale2*_psi2_denom[:,None,None,:])
+    _psi2_common = gamma/(lengthscale2 * _psi2_denom * _psi2_denom_sqrt) # NxQ
+    _psi2_exponent1 = -_psi2_Zdist_sq -_psi2_mudist_sq -0.5*np.log(_psi2_denom[:,None,None,:])+np.log(gamma[:,None,None,:]) #N,M,M,Q
+    _psi2_exponent2 = np.log(1.-gamma[:,None,None,:]) - 0.5*(_psi2_Z_sq_sum) # NxMxMxQ
+    _psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2)
+    _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max))
+    _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM
+    _psi2_q = variance*variance * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ
+    _psi2_exp_dist_sq = np.exp(-_psi2_Zdist_sq -_psi2_mudist_sq) # NxMxMxQ
+    _psi2_exp_Z = np.exp(-0.5*_psi2_Z_sq_sum) # MxMxQ
+    _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM
+    _dL_dvariance = np.einsum('mo,mo->',dL_dpsi2,_psi2)*2./variance
+    _dL_dgamma = np.einsum('mo,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,(_psi2_exp_dist_sq/_psi2_denom_sqrt[:,None,None,:] - _psi2_exp_Z))
+    _dL_dmu = -2.*np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,_psi2_common,_psi2_mudist,_psi2_exp_dist_sq)
+    _dL_dS = np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q, _psi2_common, (2.*_psi2_mudist_sq-1.), _psi2_exp_dist_sq)
+    _dL_dZ = 2.*np.einsum('mo,nmoq,nmoq->mq',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(-_psi2_Zdist*_psi2_denom[:,None,None,:]+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z))
+    _dL_dlengthscale = 2.*lengthscale* np.einsum('mo,nmoq,nmoq->q',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z))
+
+    return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma
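The psi1 and psi2 statistics above combine a "slab" term weighted by gamma and a "spike" term weighted by (1 - gamma) in log space, subtracting the elementwise maximum before exponentiating so that neither exponent overflows or underflows. A minimal NumPy sketch of that log-sum-exp pattern, illustrative only and not part of the patch (the helper name and toy values are made up):

import numpy as np

def logsumexp_pair(a, b):
    # numerically stable log(exp(a) + exp(b)), mirroring the _psi*_exponent_max trick
    m = np.maximum(a, b)
    return m + np.log(np.exp(a - m) + np.exp(b - m))

gamma = np.array([0.9, 0.5, 0.1])
log_slab = np.log(gamma) - 800.        # exponents this negative underflow under a naive exp()
log_spike = np.log(1. - gamma) - 802.
naive = np.log(np.exp(log_slab) + np.exp(log_spike))   # -> -inf everywhere (both exps underflow to 0)
stable = logsumexp_pair(log_slab, log_spike)            # -> finite values near -800

The same pattern appears once per latent dimension; the per-dimension logs are then summed before the final exp, which is why the code keeps everything in log space until _psi1 / _psi2 are formed.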
diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py
index cce26783..aa3a6478 100644
--- a/GPy/models/__init__.py
+++ b/GPy/models/__init__.py
@@ -4,7 +4,6 @@
 from gp_regression import GPRegression
 from gp_classification import GPClassification
 from sparse_gp_regression import SparseGPRegression, SparseGPRegressionUncertainInput
-from svigp_regression import SVIGPRegression
 from sparse_gp_classification import SparseGPClassification
 from gplvm import GPLVM
 from bcgplvm import BCGPLVM
diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py
index d363fb7a..fca97e96 100644
--- a/GPy/models/bayesian_gplvm.py
+++ b/GPy/models/bayesian_gplvm.py
@@ -25,8 +25,6 @@ class BayesianGPLVM(SparseGP_MPI):
                  Z=None, kernel=None, inference_method=None, likelihood=None,
                  name='bayesian gplvm', mpi_comm=None, normalizer=None,
                  missing_data=False, stochastic=False, batchsize=1):
-        self.mpi_comm = mpi_comm
-        self.__IN_OPTIMIZATION__ = False
         self.logger = logging.getLogger(self.__class__.__name__)
         if X is None:
diff --git a/GPy/models/sparse_gp_regression.py b/GPy/models/sparse_gp_regression.py
index 5a56bb7d..49c3914c 100644
--- a/GPy/models/sparse_gp_regression.py
+++ b/GPy/models/sparse_gp_regression.py
@@ -9,6 +9,7 @@ from .. import likelihoods
 from .. import kern
 from ..inference.latent_function_inference import VarDTC
 from ..core.parameterization.variational import NormalPosterior
+from GPy.inference.latent_function_inference.var_dtc_parallel import VarDTC_minibatch

 class SparseGPRegression(SparseGP_MPI):
     """
@@ -30,7 +31,7 @@ class SparseGPRegression(SparseGP_MPI):
     """

-    def __init__(self, X, Y, kernel=None, Z=None, num_inducing=10, X_variance=None, normalizer=None):
+    def __init__(self, X, Y, kernel=None, Z=None, num_inducing=10, X_variance=None, normalizer=None, mpi_comm=None):
         num_data, input_dim = X.shape

         # kern defaults to rbf (plus white for stability)
@@ -48,8 +49,14 @@ class SparseGPRegression(SparseGP_MPI):
         if not (X_variance is None):
             X = NormalPosterior(X,X_variance)
+
+        if mpi_comm is not None:
+            from ..inference.latent_function_inference.var_dtc_parallel import VarDTC_minibatch
+            infr = VarDTC_minibatch(mpi_comm=mpi_comm)
+        else:
+            infr = VarDTC()

-        SparseGP_MPI.__init__(self, X, Y, Z, kernel, likelihood, inference_method=VarDTC(), normalizer=normalizer)
+        SparseGP_MPI.__init__(self, X, Y, Z, kernel, likelihood, inference_method=infr, normalizer=normalizer, mpi_comm=mpi_comm)

     def parameters_changed(self):
         from ..inference.latent_function_inference.var_dtc_parallel import update_gradients_sparsegp,VarDTC_minibatch
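With the new mpi_comm argument, SparseGPRegression selects VarDTC_minibatch when a communicator is supplied and falls back to the serial VarDTC otherwise. A hedged usage sketch (not part of the patch; the data, seed and num_inducing values are arbitrary):

import numpy as np
import GPy

np.random.seed(0)
X = np.random.uniform(-3., 3., (200, 1))
Y = np.sin(X) + np.random.randn(200, 1) * 0.05

# default path: no communicator, so the constructor picks infr = VarDTC()
m = GPy.models.SparseGPRegression(X, Y, num_inducing=20)
m.optimize(max_iters=50)
print(m)

# under MPI (launched with e.g. `mpirun -n 4 python script.py`) a communicator
# switches the inference to VarDTC_minibatch(mpi_comm=...):
#   from mpi4py import MPI
#   m = GPy.models.SparseGPRegression(X, Y, num_inducing=20, mpi_comm=MPI.COMM_WORLD)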
diff --git a/GPy/models/svigp_regression.py b/GPy/models/svigp_regression.py
deleted file mode 100644
index 3397e31e..00000000
--- a/GPy/models/svigp_regression.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Copyright (c) 2012, James Hensman
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-import numpy as np
-from ..core import SVIGP
-from .. import likelihoods
-from .. import kern
-
-class SVIGPRegression(SVIGP):
-    """
-    Gaussian Process model for regression
-
-    This is a thin wrapper around the SVIGP class, with a set of sensible defalts
-
-    :param X: input observations
-    :param Y: observed values
-    :param kernel: a GPy kernel, defaults to rbf+white
-    :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
-    :type normalize_X: False|True
-    :param normalize_Y: whether to normalize the input data before computing (predictions will be in original scales)
-    :type normalize_Y: False|True
-    :rtype: model object
-
-    .. Note:: Multiple independent outputs are allowed using columns of Y
-
-    """
-
-    def __init__(self, X, Y, kernel=None, Z=None, num_inducing=10, q_u=None, batchsize=10, normalize_Y=False):
-        # kern defaults to rbf (plus white for stability)
-        if kernel is None:
-            kernel = kern.rbf(X.shape[1], variance=1., lengthscale=4.) + kern.white(X.shape[1], 1e-3)
-
-        # Z defaults to a subset of the data
-        if Z is None:
-            i = np.random.permutation(X.shape[0])[:num_inducing]
-            Z = X[i].copy()
-        else:
-            assert Z.shape[1] == X.shape[1]
-
-        # likelihood defaults to Gaussian
-        likelihood = likelihoods.Gaussian(Y, normalize=normalize_Y)
-
-        SVIGP.__init__(self, X, likelihood, kernel, Z, q_u=q_u, batchsize=batchsize)
-        self.load_batch()
-
diff --git a/GPy/plotting/matplot_dep/__init__.py b/GPy/plotting/matplot_dep/__init__.py
index f493513a..4c4402ce 100644
--- a/GPy/plotting/matplot_dep/__init__.py
+++ b/GPy/plotting/matplot_dep/__init__.py
@@ -6,7 +6,6 @@ import models_plots
 import priors_plots
 import variational_plots
 import kernel_plots
-import svig_plots
 import dim_reduction_plots
 import mapping_plots
 import Tango
diff --git a/GPy/testing/mpi_tests.py b/GPy/testing/mpi_tests.py
index 4848a6ec..45777eb1 100644
--- a/GPy/testing/mpi_tests.py
+++ b/GPy/testing/mpi_tests.py
@@ -21,6 +21,7 @@ comm = MPI.COMM_WORLD
 N = 100
 x = np.linspace(-6., 6., N)
 y = np.sin(x) + np.random.randn(N) * 0.05
+comm.Bcast(y)
 data = np.vstack([x,y])
 infr = GPy.inference.latent_function_inference.VarDTC_minibatch(mpi_comm=comm)
 m = GPy.models.BayesianGPLVM(data.T,1,mpi_comm=comm)
@@ -39,9 +40,43 @@ if comm.rank==0:
         (stdout, stderr) = p.communicate()
         L1 = float(stdout.splitlines()[-2])
         L2 = float(stdout.splitlines()[-1])
-        self.assertAlmostEqual(L1, L2)
+        self.assertTrue(np.allclose(L1,L2))
         import os
         os.remove('mpi_test__.py')
+
+    def test_SparseGPRegression_MPI(self):
+        code = """
+import numpy as np
+import GPy
+from mpi4py import MPI
+np.random.seed(123456)
+comm = MPI.COMM_WORLD
+N = 100
+x = np.linspace(-6., 6., N)
+y = np.sin(x) + np.random.randn(N) * 0.05
+comm.Bcast(y)
+data = np.vstack([x,y])
+#infr = GPy.inference.latent_function_inference.VarDTC_minibatch(mpi_comm=comm)
+m = GPy.models.SparseGPRegression(data[:1].T,data[1:2].T,mpi_comm=comm)
+m.optimize(max_iters=10)
+if comm.rank==0:
+    print float(m.objective_function())
+    m.inference_method.mpi_comm=None
+    m.mpi_comm=None
+    m._trigger_params_changed()
+    print float(m.objective_function())
+"""
+        with open('mpi_test__.py','w') as f:
+            f.write(code)
+            f.close()
+        p = subprocess.Popen('mpirun -n 4 python mpi_test__.py',stdout=subprocess.PIPE,shell=True)
+        (stdout, stderr) = p.communicate()
+        L1 = float(stdout.splitlines()[-2])
+        L2 = float(stdout.splitlines()[-1])
+        self.assertTrue(np.allclose(L1,L2))
+        import os
+        os.remove('mpi_test__.py')
+
 except:
     pass
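The added test follows the same recipe as the existing BayesianGPLVM MPI test: broadcast the randomly generated targets so every rank optimizes identical data, fit with the parallel inference, then have rank 0 detach the communicator, re-run inference serially and check that the two objectives agree. Condensed outside the unittest harness, the check looks roughly like the sketch below (an assumption-laden sketch: it requires mpi4py and an mpirun launch, and the shapes mirror the test's data):

import numpy as np
import GPy
from mpi4py import MPI

comm = MPI.COMM_WORLD
x = np.linspace(-6., 6., 100)
y = np.sin(x) + np.random.randn(100) * 0.05
comm.Bcast(y)                                  # every rank now holds rank 0's noisy targets

m = GPy.models.SparseGPRegression(x[:, None], y[:, None], mpi_comm=comm)
m.optimize(max_iters=10)
if comm.rank == 0:
    L_parallel = float(m.objective_function())
    m.inference_method.mpi_comm = None         # drop MPI and recompute the objective serially
    m.mpi_comm = None
    m._trigger_params_changed()
    L_serial = float(m.objective_function())
    assert np.allclose(L_parallel, L_serial)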
diff --git a/doc/index.rst b/doc/index.rst
index c00f31d3..fec4aef1 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -5,23 +5,28 @@
 Welcome to GPy's documentation!
 ===============================

-For a quick start, you can have a look at one of the tutorials:
-* `Basic Gaussian process regression `_
-* `Interacting with models `_
-* `A kernel overview `_
-* `Writing new kernels `_
-* `Writing new models `_
-* `Parameterization handles `_
+`GPy `_ is a Gaussian Process (GP) framework written in Python, from the Sheffield machine learning group.

-You may also be interested by some examples in the GPy/examples folder.
+The `GPy homepage `_ contains tutorials for users and further information on the project, including installation instructions.
+This documentation is mostly aimed at developers interacting closely with the code-base.
+
+The code can be found on our `Github project page `_. It is open source and provided under the BSD license.
+
+.. * `Basic Gaussian process regression `_
+.. * `Interacting with models `_
+.. * `A kernel overview `_
+.. * `Writing new kernels `_
+.. * `Writing new models `_
+.. * `Parameterization handles `_
+
+.. You may also be interested by some examples in the GPy/examples folder.

 Contents:

 .. toctree::
    :maxdepth: 2

-   installation
    GPy
diff --git a/doc/log.txt b/doc/log.txt
index 02532259..1b563edd 100644
--- a/doc/log.txt
+++ b/doc/log.txt
@@ -1,7 +1,7 @@
 /home/alans/Work/GPy/GPy/__init__.py:docstring of GPy.load:1: WARNING: Inline interpreted text or phrase reference start-string without end-string.
-/home/alans/Work/GPy/GPy/core/gp.py:docstring of GPy.core.gp.GP:7: WARNING: Field list ends without a blank line; unexpected unindent.
-/home/alans/Work/GPy/GPy/core/gp.py:docstring of GPy.core.gp.GP:10: ERROR: Unexpected indentation.
+/home/alans/Work/GPy/GPy/core/gp.py:docstring of GPy.core.gp.GP.optimize:8: ERROR: Unknown interpreted text role "module".
 /home/alans/Work/GPy/GPy/core/gp.py:docstring of GPy.core.gp.GP.predictive_gradients:5: ERROR: Unexpected indentation.
+/home/alans/Work/GPy/GPy/core/gp.py:docstring of GPy.core.gp.GP.predictive_gradients:8: WARNING: Block quote ends without a blank line; unexpected unindent.
 /home/alans/Work/GPy/GPy/core/model.py:docstring of GPy.core.model.Model.optimize_restarts:29: WARNING: Explicit markup ends without a blank line; unexpected unindent.
 /home/alans/Work/GPy/doc/GPy.core.rst:65: WARNING: autodoc: failed to import module u'GPy.core.symbolic'; the following exception was raised:
 Traceback (most recent call last):
@@ -23,31 +23,6 @@
 ImportError: No module named lambdify
 /home/alans/Work/GPy/GPy/core/parameterization/ties_and_remappings.py:docstring of GPy.core.parameterization.ties_and_remappings.Tie:18: SEVERE: Unexpected section title or transition.
 ================================
-/home/alans/Work/GPy/doc/GPy.examples.rst:18: WARNING: autodoc: failed to import module u'GPy.examples.coreg_example'; the following exception was raised:
-Traceback (most recent call last):
-  File "/home/alans/anaconda/envs/GPy/lib/python2.7/site-packages/sphinx/ext/autodoc.py", line 335, in import_object
-    __import__(self.modname)
-  File "/home/alans/Work/GPy/GPy/examples/coreg_example.py", line 44, in
-    m = GPy.models.SparseGPCoregionalizedRegression(X_list=[X1], Y_list=[Y1], Z_list = [Z1], kernel=kern)
-  File "/home/alans/Work/GPy/GPy/core/parameterization/parameterized.py", line 19, in __call__
-    self = super(ParametersChangedMeta, self).__call__(*args, **kw)
-  File "/home/alans/Work/GPy/GPy/models/sparse_gp_coregionalized_regression.py", line 66, in __init__
-    self['.*inducing'][:,-1].fix()
-  File "/home/alans/Work/GPy/GPy/core/parameterization/parameter_core.py", line 444, in constrain_fixed
-    self.notify_observers(self, None if trigger_parent else -np.inf)
-  File "/home/alans/Work/GPy/GPy/core/parameterization/parameter_core.py", line 148, in notify_observers
-    [callble(self, which=which) for _, _, callble in self.observers]
-  File "/home/alans/Work/GPy/GPy/core/parameterization/parameter_core.py", line 1077, in _pass_through_notify_observers
-    self.notify_observers(which=which)
-  File "/home/alans/Work/GPy/GPy/core/parameterization/parameter_core.py", line 148, in notify_observers
-    [callble(self, which=which) for _, _, callble in self.observers]
-  File "/home/alans/Work/GPy/GPy/core/parameterization/parameter_core.py", line 1075, in _parameters_changed_notification
-    self.parameters_changed()
-  File "/home/alans/Work/GPy/GPy/core/sparse_gp.py", line 66, in parameters_changed
-    self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata)
-  File "/home/alans/Work/GPy/GPy/inference/latent_function_inference/var_dtc.py", line 170, in inference
-    import ipdb; ipdb.set_trace()
-ImportError: No module named ipdb
 /home/alans/Work/GPy/GPy/kern/_src/coregionalize.py:docstring of GPy.kern._src.coregionalize.Coregionalize:5: ERROR: Unexpected indentation.
 /home/alans/Work/GPy/doc/GPy.kern._src.rst:73: WARNING: autodoc: failed to import module u'GPy.kern._src.hierarchical'; the following exception was raised:
 Traceback (most recent call last):
@@ -187,6 +162,7 @@
 Parameter handles
 :py:class:`~GPy.core.parameterization.param.Param`
 ===========
+/home/alans/Work/GPy/doc/installation.rst:: WARNING: document isn't included in any toctree
 /home/alans/Work/GPy/doc/kernel_implementation.rst:: WARNING: document isn't included in any toctree
 /home/alans/Work/GPy/doc/modules.rst:: WARNING: document isn't included in any toctree
 /home/alans/Work/GPy/doc/tuto_GP_regression.rst:: WARNING: document isn't included in any toctree