diff --git a/GPy/core/model.py b/GPy/core/model.py index 21bcf0c7..6514d73a 100644 --- a/GPy/core/model.py +++ b/GPy/core/model.py @@ -20,14 +20,13 @@ class Model(Parameterized): self.optimization_runs = [] self.sampling_runs = [] self.preferred_optimizer = 'scg' - # self._set_params(self._get_params()) has been taken out as it should only be called on leaf nodes + def log_likelihood(self): raise NotImplementedError, "this needs to be implemented to use the model class" + def _log_likelihood_gradients(self): - # def dK_d(self, param, dL_dK, X, X2) g = np.zeros(self.size) try: - # [g.__setitem__(s, self.gradient_mapping[p]().flat) for p, s in itertools.izip(self._parameters_, self._param_slices_) if not p.is_fixed] [p._collect_gradient(g[s]) for p, s in itertools.izip(self._parameters_, self._param_slices_) if not p.is_fixed] except ValueError: raise ValueError, 'Gradient for {} not defined, please specify gradients for parameters to optimize'.format(p.name) @@ -61,85 +60,6 @@ class Model(Parameterized): self.priors = state.pop() Parameterized._setstate(self, state) -# def set_prior(self, regexp, what): -# """ -# -# Sets priors on the model parameters. -# -# **Notes** -# -# Asserts that the prior is suitable for the constraint. If the -# wrong constraint is in place, an error is raised. If no -# constraint is in place, one is added (warning printed). -# -# For tied parameters, the prior will only be "counted" once, thus -# a prior object is only inserted on the first tied index -# -# :param regexp: regular expression of parameters on which priors need to be set. -# :type param: string, regexp, or integer array -# :param what: prior to set on parameter. -# :type what: GPy.core.Prior type -# -# """ -# if self.priors is None: -# self.priors = [None for i in range(self._get_params().size)] -# -# which = self.grep_param_names(regexp) -# -# # check tied situation -# tie_partial_matches = [tie for tie in self.tied_indices if (not set(tie).isdisjoint(set(which))) & (not set(tie) == set(which))] -# if len(tie_partial_matches): -# raise ValueError, "cannot place prior across partial ties" -# tie_matches = [tie for tie in self.tied_indices if set(which) == set(tie) ] -# if len(tie_matches) > 1: -# raise ValueError, "cannot place prior across multiple ties" -# elif len(tie_matches) == 1: -# which = which[:1] # just place a prior object on the first parameter -# -# -# # check constraints are okay -# -# if what.domain is _POSITIVE: -# constrained_positive_indices = [i for i, t in zip(self.constrained_indices, self.constraints) if t.domain is _POSITIVE] -# if len(constrained_positive_indices): -# constrained_positive_indices = np.hstack(constrained_positive_indices) -# else: -# constrained_positive_indices = np.zeros(shape=(0,)) -# bad_constraints = np.setdiff1d(self.all_constrained_indices(), constrained_positive_indices) -# assert not np.any(which[:, None] == bad_constraints), "constraint and prior incompatible" -# unconst = np.setdiff1d(which, constrained_positive_indices) -# if len(unconst): -# print "Warning: constraining parameters to be positive:" -# print '\n'.join([n for i, n in enumerate(self._get_param_names()) if i in unconst]) -# print '\n' -# self.constrain_positive(unconst) -# elif what.domain is _REAL: -# assert not np.any(which[:, None] == self.all_constrained_indices()), "constraint and prior incompatible" -# else: -# raise ValueError, "prior not recognised" -# -# # store the prior in a local list -# for w in which: -# self.priors[w] = what - - def get_gradient(self, name, 
return_names=False): - """ - Get model gradient(s) by name. The name is applied as a regular expression and all parameters that match that regular expression are returned. - - :param name: the name of parameters required (as a regular expression). - :type name: regular expression - :param return_names: whether or not to return the names matched (default False) - :type return_names: bool - """ - matches = self.grep_param_names(name) - if len(matches): - if return_names: - return self._log_likelihood_gradients()[matches], np.asarray(self._get_param_names())[matches].tolist() - else: - return self._log_likelihood_gradients()[matches] - else: - raise AttributeError, "no parameter matches %s" % name - def randomize(self): """ Randomize the model. @@ -183,7 +103,9 @@ class Model(Parameterized): :param messages: whether to display during optimisation :type messages: bool - .. note:: If num_processes is None, the number of workes in the multiprocessing pool is automatically set to the number of processors on the current machine. + .. note:: If num_processes is None, the number of workers in the + multiprocessing pool is automatically set to the number of processors + on the current machine. """ initial_parameters = self._get_params_transformed() @@ -227,45 +149,23 @@ class Model(Parameterized): self._set_params_transformed(initial_parameters) def ensure_default_constraints(self, warning=True): - """ + """ Ensure that any variables which should clearly be positive have been constrained somehow. The method performs a regular expression search on parameter names looking for the terms 'variance', 'lengthscale', 'precision' and 'kappa'. If any of these terms are present in the name the parameter is constrained positive. + + DEPRECATED. """ raise DeprecationWarning, 'parameters now have default constraints' - #positive_strings = ['variance', 'lengthscale', 'precision', 'kappa', 'sensitivity'] - # param_names = self._get_param_names() - -# for s in positive_strings: -# paramlist = self.grep_param_names(".*"+s) -# for param in paramlist: -# for p in param.flattened_parameters: -# rav_i = set(self._raveled_index_for(p)) -# for constraint in self.constraints.iter_properties(): -# rav_i -= set(self._constraint_indices(p, constraint)) -# rav_i -= set(np.nonzero(self._fixes_for(p)!=UNFIXED)[0]) -# ind = self._backtranslate_index(p, np.array(list(rav_i), dtype=int)) -# if ind.size != 0: -# p[np.unravel_index(ind, p.shape)].constrain_positive(warning=warning) -# if paramlist: -# self.__getitem__(None, paramlist).constrain_positive(warning=warning) -# currently_constrained = self.all_constrained_indices() -# to_make_positive = [] -# for s in positive_strings: -# for i in self.grep_param_names(".*" + s): -# if not (i in currently_constrained): -# to_make_positive.append(i) -# if len(to_make_positive): -# self.constrain_positive(np.asarray(to_make_positive), warning=warning) def objective_function(self, x): """ The objective function passed to the optimizer. It combines the likelihood and the priors. - + Failures are handled robustly. The algorithm will try several times to return the objective, and will raise the original exception if the objective cannot be computed. @@ -336,7 +236,9 @@ class Model(Parameterized): :messages: whether to display during optimisation :type messages: bool :param optimizer: which optimizer to use (defaults to self.preferred optimizer) - :type optimizer: string TODO: valid strings?
+ :type optimizer: string + + TODO: document the valid optimizer strings """ if optimizer is None: optimizer = self.preferred_optimizer @@ -389,15 +291,15 @@ class Model(Parameterized): indices = np.r_[:self.size] which = (transformed_index[:,None]==indices[self._fixes_][None,:]).nonzero() transformed_index = (indices-(~self._fixes_).cumsum())[transformed_index[which[0]]] - + if transformed_index.size == 0: print "No free parameters to check" return - + # just check the global ratio dx = np.zeros_like(x) dx[transformed_index] = step * np.sign(np.random.uniform(-1, 1, transformed_index.size)) - + # evaulate around the point x f1 = self.objective_function(x + dx) f2 = self.objective_function(x - dx) @@ -405,7 +307,7 @@ class Model(Parameterized): dx = dx[transformed_index] gradient = gradient[transformed_index] - + numerical_gradient = (f1 - f2) / (2 * dx) global_ratio = (f1 - f2) / (2 * np.dot(dx, np.where(gradient == 0, 1e-32, gradient))) return (np.abs(1. - global_ratio) < tolerance) or (np.abs(gradient - numerical_gradient).mean() < tolerance) @@ -439,7 +341,7 @@ class Model(Parameterized): #print param_index, transformed_index else: transformed_index = param_index - + if param_index.size == 0: print "No free parameters to check" return @@ -472,82 +374,5 @@ class Model(Parameterized): print grad_string self._set_params_transformed(x) return ret - - def input_sensitivity(self): - """ - return an array describing the sesitivity of the model to each input - NB. Right now, we're basing this on the lengthscales (or - variances) of the kernel. TODO: proper sensitivity analysis - where we integrate across the model inputs and evaluate the - effect on the variance of the model output. """ - if not hasattr(self, 'kern'): - raise ValueError, "this model has no kernel" - - k = self.kern#[p for p in self.kern._parameters_ if hasattr(p, "ARD") and p.ARD] - from ..kern import RBF, Linear#, RBFInv - - if isinstance(k, RBF): - return 1. / k.lengthscale - #elif isinstance(k, RBFInv): - # return k.inv_lengthscale - elif isinstance(k, Linear): - return k.variances - else: - raise ValueError, "cannot determine sensitivity for this kernel" - - def pseudo_EM(self, stop_crit=.1, **kwargs): - """ - EM - like algorithm for Expectation Propagation and Laplace approximation - - :param stop_crit: convergence criterion - :type stop_crit: float - - .. Note: kwargs are passed to update_likelihood and optimize functions. - """ - assert isinstance(self.likelihood, likelihoods.EP) or isinstance(self.likelihood, likelihoods.EP_Mixed_Noise), "pseudo_EM is only available for EP likelihoods" - ll_change = stop_crit + 1.
- iteration = 0 - last_ll = -np.inf - - convergence = False - alpha = 0 - stop = False - - # Handle **kwargs - ep_args = {} - for arg in kwargs.keys(): - if arg in ('epsilon', 'power_ep'): - ep_args[arg] = kwargs[arg] - del kwargs[arg] - - while not stop: - last_approximation = self.likelihood.copy() - last_params = self._get_params() - if len(ep_args) == 2: - self.update_likelihood_approximation(epsilon=ep_args['epsilon'], power_ep=ep_args['power_ep']) - elif len(ep_args) == 1: - if ep_args.keys()[0] == 'epsilon': - self.update_likelihood_approximation(epsilon=ep_args['epsilon']) - elif ep_args.keys()[0] == 'power_ep': - self.update_likelihood_approximation(power_ep=ep_args['power_ep']) - else: - self.update_likelihood_approximation() - new_ll = self.log_likelihood() - ll_change = new_ll - last_ll - - if ll_change < 0: - self.likelihood = last_approximation # restore previous likelihood approximation - self._set_params(last_params) # restore model parameters - print "Log-likelihood decrement: %s \nLast likelihood update discarded." % ll_change - stop = True - else: - self.optimize(**kwargs) - last_ll = self.log_likelihood() - if ll_change < stop_crit: - stop = True - iteration += 1 - if stop: - print "%s iterations." % iteration - self.update_likelihood_approximation() diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index ccbc76d5..8eac69da 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -15,7 +15,7 @@ __precision__ = numpy.get_printoptions()['precision'] # numpy printing precision __print_threshold__ = 5 ###### -class Param(Constrainable, ObservableArray, Gradcheckable, Indexable): +class Param(Constrainable, ObservableArray, Gradcheckable): """ Parameter object for GPy models. diff --git a/GPy/core/parameterization/transformations.py b/GPy/core/parameterization/transformations.py index 51194865..36291ca3 100644 --- a/GPy/core/parameterization/transformations.py +++ b/GPy/core/parameterization/transformations.py @@ -4,10 +4,10 @@ import numpy as np from domains import _POSITIVE,_NEGATIVE, _BOUNDED -import sys import weakref -_lim_val = -np.log(sys.float_info.epsilon) +_exp_lim_val = np.finfo(np.float64).max +_lim_val = np.log(_exp_lim_val)#-np.log(sys.float_info.epsilon) #=============================================================================== # Fixing constants @@ -34,6 +34,16 @@ class Transformation(object): def initialize(self, f): """ produce a sensible initial value for f(x)""" raise NotImplementedError + def plot(self, xlabel=r'transformed $\theta$', ylabel=r'$\theta$', axes=None, *args,**kw): + import sys + assert "matplotlib" in sys.modules, "matplotlib package has not been imported." + import matplotlib.pyplot as plt + from ...plotting.matplot_dep import base_plots + x = np.linspace(-8,8) + base_plots.meanplot(x, self.f(x), axes=axes, *args, **kw) + axes = plt.gca() + axes.set_xlabel(xlabel) + axes.set_ylabel(ylabel) def __str__(self): raise NotImplementedError def __repr__(self): @@ -54,6 +64,24 @@ class Logexp(Transformation): return np.abs(f) def __str__(self): return '+ve' + + +class LogexpNeg(Transformation): + domain = _POSITIVE + def f(self, x): + return np.where(x>_lim_val, -x, -np.log(1. + np.exp(np.clip(x, -np.inf, _lim_val)))) + #raises overflow warning: return np.where(x>_lim_val, x, np.log(1.
+ np.exp(x))) + def finv(self, f): + return np.where(f>_lim_val, 0, np.log(np.exp(-f) - 1.)) + def gradfactor(self, f): + return np.where(f>_lim_val, -1, -1 + np.exp(-f)) + def initialize(self, f): + if np.any(f < 0.): + print "Warning: changing parameters to satisfy constraints" + return np.abs(f) + def __str__(self): + return '+ve' + class NegativeLogexp(Transformation): domain = _NEGATIVE diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index 00a80c7b..15a4e1f8 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -2,7 +2,6 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np -from ..util.linalg import mdot from gp import GP from parameterization.param import Param from ..inference.latent_function_inference import var_dtc @@ -32,7 +31,7 @@ class SparseGP(GP): """ - def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None, name='sparse gp'): + def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None, X_variance=None, name='sparse gp'): #pick a sensible inference method if inference_method is None: @@ -58,11 +57,33 @@ class SparseGP(GP): self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y) self.likelihood.update_gradients(self.grad_dict.pop('partial_for_likelihood')) if isinstance(self.X, VariationalPosterior): - self.kern.update_gradients_variational(posterior_variational=self.X, Z=self.Z, **self.grad_dict) - self.Z.gradient = self.kern.gradients_Z_variational(posterior_variational=self.X, Z=self.Z, **self.grad_dict) + #gradients wrt kernel + dL_dKmm = self.grad_dict.pop('dL_dKmm') + self.kern.update_gradients_full(dL_dKmm, self.Z, None) + target = np.zeros(self.kern.size) + self.kern._collect_gradient(target) + self.kern.update_gradients_expectations(variational_posterior=self.X, Z=self.Z, **self.grad_dict) + self.kern._collect_gradient(target) + self.kern._set_gradient(target) + + #gradients wrt Z + self.Z.gradient = self.kern.gradients_X(dL_dKmm, self.Z) + self.Z.gradient += self.kern.gradients_Z_expectations( + self.grad_dict['dL_dpsi1'], self.grad_dict['dL_dpsi2'], Z=self.Z, variational_posterior=self.X) else: - self.kern.update_gradients_sparse(X=self.X, Z=self.Z, **self.grad_dict) - self.Z.gradient = self.kern.gradients_Z_sparse(X=self.X, Z=self.Z, **self.grad_dict) + #gradients wrt kernel + target = np.zeros(self.kern.size) + self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X) + self.kern._collect_gradient(target) + self.kern.update_gradients_full(self.grad_dict['dL_dKnm'], self.X, self.Z) + self.kern._collect_gradient(target) + self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z, None) + self.kern._collect_gradient(target) + self.kern._set_gradient(target) + + #gradients wrt Z + self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z) + self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X) def _raw_predict(self, Xnew, X_variance_new=None, full_cov=False): """ diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index a2686d73..2044f08d 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -16,16 +16,16 @@ def bgplvm_test_model(optimize=False, verbose=1, plot=False, output_dim=200, nan output_dim = 1 input_dim = 3 else: - input_dim = 1 + input_dim = 2 output_dim = output_dim # generate GPLVM-like data X = _np.random.rand(num_inputs, 
input_dim) - #lengthscales = _np.random.rand(input_dim) - #k = (GPy.kern.RBF(input_dim, .5, lengthscales, ARD=True) + lengthscales = _np.random.rand(input_dim) + k = GPy.kern.RBF(input_dim, .5, lengthscales, ARD=True) ##+ GPy.kern.white(input_dim, 0.01) #) - k = GPy.kern.Linear(input_dim, ARD=1)# + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001) + #k = GPy.kern.Linear(input_dim, ARD=1)# + GPy.kern.bias(input_dim) + GPy.kern.white(input_dim, 0.00001) K = k.K(X) Y = _np.random.multivariate_normal(_np.zeros(num_inputs), K, (output_dim,)).T @@ -164,7 +164,7 @@ def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, _np.random.seed(0) data = GPy.util.datasets.oil() - kernel = GPy.kern.RBF(Q, 1., _np.random.uniform(0,1,(Q,)), ARD=True)# + GPy.kern.Bias(Q, _np.exp(-2)) + kernel = GPy.kern.RBF(Q, 1., 1./_np.random.uniform(0,1,(Q,)), ARD=True)# + GPy.kern.Bias(Q, _np.exp(-2)) Y = data['X'][:N] m = GPy.models.BayesianGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing, **k) m.data_labels = data['Y'][:N].argmax(axis=1) @@ -270,10 +270,11 @@ def bgplvm_simulation(optimize=True, verbose=1, from GPy import kern from GPy.models import BayesianGPLVM - D1, D2, D3, N, num_inducing, Q = 15, 5, 8, 30, 3, 10 + D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 45, 5, 9 _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim) Y = Ylist[0] k = kern.Linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q) + #k = kern.RBF(Q, ARD=True, lengthscale=10.) m = BayesianGPLVM(Y, Q, init="PCA", num_inducing=num_inducing, kernel=k) if optimize: @@ -281,7 +282,7 @@ def bgplvm_simulation(optimize=True, verbose=1, m.optimize('bfgs', messages=verbose, max_iters=max_iters, gtol=.05) if plot: - m.q.plot("BGPLVM Latent Space 1D") + m.X.plot("BGPLVM Latent Space 1D") m.kern.plot_ARD('BGPLVM Simulation ARD Parameters') return m @@ -293,7 +294,7 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1, from GPy.models import BayesianGPLVM from GPy.inference.latent_function_inference.var_dtc import VarDTCMissingData - D1, D2, D3, N, num_inducing, Q = 15, 5, 8, 30, 5, 9 + D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 45, 5, 9 _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim) Y = Ylist[0] k = kern.Linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q) @@ -302,7 +303,7 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1, m = BayesianGPLVM(Y.copy(), Q, init="random", num_inducing=num_inducing, kernel=k) m.inference_method = VarDTCMissingData() m.Y[inan] = _np.nan - m.q.variance *= .1 + m.X.variance *= .1 m.parameters_changed() m.Yreal = Y @@ -311,7 +312,7 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1, m.optimize('bfgs', messages=verbose, max_iters=max_iters, gtol=.05) if plot: - m.q.plot("BGPLVM Latent Space 1D") + m.X.plot("BGPLVM Latent Space 1D") m.kern.plot_ARD('BGPLVM Simulation ARD Parameters') return m diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py index ef4484bd..fec61204 100644 --- a/GPy/inference/latent_function_inference/var_dtc.py +++ b/GPy/inference/latent_function_inference/var_dtc.py @@ -204,15 +204,14 @@ class VarDTCMissingData(object): def inference(self, kern, X, Z, likelihood, Y): if isinstance(X, VariationalPosterior): uncertain_inputs = True - psi0 = kern.psi0(Z, X) - psi1 = kern.psi1(Z, X) - psi2 = kern.psi2(Z, X) + psi0_all = kern.psi0(Z, X) + psi1_all = kern.psi1(Z, X) + psi2_all = kern.psi2(Z, X) else: uncertain_inputs = 
False - psi0 = kern.Kdiag(X) - psi1 = kern.K(X, Z) - psi2 = None - + psi0_all = kern.Kdiag(X) + psi1_all = kern.K(X, Z) + psi2_all = None Ys, traces = self._Y(Y) beta_all = 1./likelihood.variance diff --git a/GPy/kern/_src/add.py b/GPy/kern/_src/add.py index 1022124d..1466800c 100644 --- a/GPy/kern/_src/add.py +++ b/GPy/kern/_src/add.py @@ -101,7 +101,7 @@ class Add(Kern): raise NotImplementedError, "psi2 cannot be computed for this kernel" return psi2 - def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, variational_posterior, Z): + def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): from white import White from rbf import RBF #from rbf_inv import RBFInv @@ -124,10 +124,10 @@ class Add(Kern): eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z[:,is2], mu[:,is2], S[:,is2]) * 2. - p1.update_gradients_variational(dL_dKmm, dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, mu[:,is1], S[:,is1], Z[:,is1]) + p1.update_gradients_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, mu[:,is1], S[:,is1], Z[:,is1]) - def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior): from white import White from rbf import RBF #from rbf_inv import rbfinv @@ -151,10 +151,10 @@ class Add(Kern): eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z[:,is2], mu[:,is2], S[:,is2]) * 2. - target += p1.gradients_z_variational(dL_dKmm, dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, mu[:,is1], S[:,is1], Z[:,is1]) + target += p1.gradients_Z_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, mu[:,is1], S[:,is1], Z[:,is1]) return target - def gradients_muS_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, mu, S, Z): + def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): from white import white from rbf import rbf #from rbf_inv import rbfinv @@ -179,7 +179,7 @@ class Add(Kern): eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(z[:,is2], mu[:,is2], s[:,is2]) * 2.
- a, b = p1.gradients_muS_variational(dL_dkmm, dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, mu[:,is1], s[:,is1], z[:,is1]) + a, b = p1.gradients_qX_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, mu[:,is1], s[:,is1], z[:,is1]) target_mu += a target_S += b return target_mu, target_S @@ -193,9 +193,9 @@ class Add(Kern): kernel_plots.plot(self,*args) def input_sensitivity(self): - in_sen = np.zeros((self.input_dim, self.num_params)) + in_sen = np.zeros((self.num_params, self.input_dim)) for i, [p, i_s] in enumerate(zip(self._parameters_, self.input_slices)): - in_sen[i_s, i] = p.input_sensitivity() + in_sen[i, i_s] = p.input_sensitivity() return in_sen def _getstate(self): diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index 6b23a69e..d7d8f9ca 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -26,55 +26,63 @@ class Kern(Parameterized): raise NotImplementedError def Kdiag(self, Xa): raise NotImplementedError - def psi0(self,Z,variational_posterior): + def psi0(self, Z, variational_posterior): raise NotImplementedError - def psi1(self,Z,variational_posterior): + def psi1(self, Z, variational_posterior): raise NotImplementedError - def psi2(self,Z,variational_posterior): + def psi2(self, Z, variational_posterior): raise NotImplementedError def gradients_X(self, dL_dK, X, X2): raise NotImplementedError def gradients_X_diag(self, dL_dK, X): raise NotImplementedError + def update_gradients_full(self, dL_dK, X, X2): """Set the gradients of all parameters when doing full (N) inference.""" raise NotImplementedError - def update_gradients_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): - target = np.zeros(self.size) - self.update_gradients_diag(dL_dKdiag, X) - self._collect_gradient(target) - self.update_gradients_full(dL_dKnm, X, Z) - self._collect_gradient(target) - self.update_gradients_full(dL_dKmm, Z, None) - self._collect_gradient(target) - self._set_gradient(target) - def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): - """Set the gradients of all parameters when doing variational (M) inference with uncertain inputs.""" + def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + """ + Set the gradients of all parameters when doing inference with + uncertain inputs, using expectations of the kernel. + + The essential maths is + + dL_d{theta_i} = dL_dpsi0 * dpsi0_d{theta_i} + + dL_dpsi1 * dpsi1_d{theta_i} + + dL_dpsi2 * dpsi2_d{theta_i} + """ raise NotImplementedError - def gradients_Z_sparse(self, dL_dKmm, dL_dKnm, dL_dKdiag, X, Z): - grad = self.gradients_X(dL_dKmm, Z) - grad += self.gradients_X(dL_dKnm.T, Z, X) - return grad - def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + + def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + """ + Returns the derivative of the objective wrt Z, using the chain rule + through the expectation variables.
+ """ raise NotImplementedError - def gradients_q_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + + def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + """ + Compute the gradients wrt the parameters of the variational + distruibution q(X), chain-ruling via the expectations of the kernel + """ raise NotImplementedError - + def plot_ARD(self, *args, **kw): - if "matplotlib" in sys.modules: - from ...plotting.matplot_dep import kernel_plots - self.plot_ARD.__doc__ += kernel_plots.plot_ARD.__doc__ + """ + See :class:`~GPy.plotting.matplot_dep.kernel_plots` + """ + import sys assert "matplotlib" in sys.modules, "matplotlib package has not been imported." from ...plotting.matplot_dep import kernel_plots return kernel_plots.plot_ARD(self,*args,**kw) - + def input_sensitivity(self): """ Returns the sensitivity for each dimension of this kernel. """ return np.zeros(self.input_dim) - + def __add__(self, other): """ Overloading of the '+' operator. for more control, see self.add """ return self.add(other) @@ -113,7 +121,8 @@ class Kern(Parameterized): def prod(self, other, tensor=False): """ - Multiply two kernels (either on the same space, or on the tensor product of the input space). + Multiply two kernels (either on the same space, or on the tensor + product of the input space). :param other: the other kernel to be added :type other: GPy.kern diff --git a/GPy/kern/_src/linear.py b/GPy/kern/_src/linear.py index 1d4f4611..44f71d85 100644 --- a/GPy/kern/_src/linear.py +++ b/GPy/kern/_src/linear.py @@ -22,22 +22,25 @@ class Linear(Kern): :param input_dim: the number of input dimensions :type input_dim: int :param variances: the vector of variances :math:`\sigma^2_i` - :type variances: array or list of the appropriate size (or float if there is only one variance parameter) - :param ARD: Auto Relevance Determination. If equal to "False", the kernel has only one variance parameter \sigma^2, otherwise there is one variance parameter per dimension. + :type variances: array or list of the appropriate size (or float if there + is only one variance parameter) + :param ARD: Auto Relevance Determination. If False, the kernel has only one + variance parameter \sigma^2, otherwise there is one variance + parameter per dimension. 
:type ARD: Boolean :rtype: kernel object + """ def __init__(self, input_dim, variances=None, ARD=False, name='linear'): super(Linear, self).__init__(input_dim, name) self.ARD = ARD - if ARD == False: + if not ARD: if variances is not None: variances = np.asarray(variances) assert variances.size == 1, "Only one variance needed for non-ARD kernel" else: variances = np.ones(1) - self._Xcache, self._X2cache = np.empty(shape=(2,)) else: if variances is not None: variances = np.asarray(variances) @@ -103,7 +106,6 @@ class Linear(Kern): #---------------------------------------# # PSI statistics # - # variational # #---------------------------------------# def psi0(self, Z, variational_posterior): @@ -117,33 +119,26 @@ class Linear(Kern): ZAinner = self._ZAinner(variational_posterior, Z) return np.dot(ZAinner, ZA.T) - def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, variational_posterior, Z): - mu, S = variational_posterior.mean, variational_posterior.variance + def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + #psi1 + self.update_gradients_full(dL_dpsi1, variational_posterior.mean, Z) # psi0: tmp = dL_dpsi0[:, None] * self._mu2S(variational_posterior) - if self.ARD: grad = tmp.sum(0) - else: grad = np.atleast_1d(tmp.sum()) - #psi1 - self.update_gradients_full(dL_dpsi1, mu, Z) - grad += self.variances.gradient + if self.ARD: self.variances.gradient += tmp.sum(0) + else: self.variances.gradient += tmp.sum() #psi2 tmp = dL_dpsi2[:, :, :, None] * (self._ZAinner(variational_posterior, Z)[:, :, None, :] * (2. * Z)[None, None, :, :]) - if self.ARD: grad += tmp.sum(0).sum(0).sum(0) - else: grad += tmp.sum() - #from Kmm - self.update_gradients_full(dL_dKmm, Z, None) - self.variances.gradient += grad + if self.ARD: self.variances.gradient += tmp.sum(0).sum(0).sum(0) + else: self.variances.gradient += tmp.sum() - def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, variational_posterior, Z): - # Kmm - grad = self.gradients_X(dL_dKmm, Z, None) + def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior): #psi1 - grad += self.gradients_X(dL_dpsi1.T, Z, variational_posterior.mean) + grad = self.gradients_X(dL_dpsi1.T, Z, variational_posterior.mean) #psi2 self._weave_dpsi2_dZ(dL_dpsi2, Z, variational_posterior, grad) return grad - def gradients_q_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, variational_posterior, Z): + def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): grad_mu, grad_S = np.zeros(variational_posterior.mean.shape), np.zeros(variational_posterior.mean.shape) # psi0 grad_mu += dL_dpsi0[:, None] * (2.0 * variational_posterior.mean * self.variances) @@ -160,7 +155,7 @@ class Linear(Kern): #--------------------------------------------------# - def _weave_dpsi2_dmuS(self, dL_dpsi2, Z, pv, target_mu, target_S): + def _weave_dpsi2_dmuS(self, dL_dpsi2, Z, vp, target_mu, target_S): # Think N,num_inducing,num_inducing,input_dim ZA = Z * self.variances AZZA = ZA.T[:, None, :, None] * ZA[None, :, None, :] @@ -203,16 +198,15 @@ class Linear(Kern): weave_options = {'headers' : [''], 'extra_compile_args': ['-fopenmp -O3'], #-march=native'], 'extra_link_args' : ['-lgomp']} - - mu = pv.mean + mu = vp.mean N,num_inducing,input_dim,mu = mu.shape[0],Z.shape[0],mu.shape[1],param_to_array(mu) weave.inline(code, support_code=support_code, libraries=['gomp'], 
arg_names=['N','num_inducing','input_dim','mu','AZZA','AZZA_2','target_mu','target_S','dL_dpsi2'], type_converters=weave.converters.blitz,**weave_options) - def _weave_dpsi2_dZ(self, dL_dpsi2, Z, pv, target): - AZA = self.variances*self._ZAinner(pv, Z) + def _weave_dpsi2_dZ(self, dL_dpsi2, Z, vp, target): + AZA = self.variances*self._ZAinner(vp, Z) code=""" int n,m,mm,q; #pragma omp parallel for private(n,mm,q) @@ -234,23 +228,23 @@ class Linear(Kern): 'extra_compile_args': ['-fopenmp -O3'], #-march=native'], 'extra_link_args' : ['-lgomp']} - N,num_inducing,input_dim = pv.mean.shape[0],Z.shape[0],pv.mean.shape[1] - mu = param_to_array(pv.mean) + N,num_inducing,input_dim = vp.mean.shape[0],Z.shape[0],vp.mean.shape[1] + mu = param_to_array(vp.mean) weave.inline(code, support_code=support_code, libraries=['gomp'], arg_names=['N','num_inducing','input_dim','AZA','target','dL_dpsi2'], type_converters=weave.converters.blitz,**weave_options) - def _mu2S(self, pv): - return np.square(pv.mean) + pv.variance + def _mu2S(self, vp): + return np.square(vp.mean) + vp.variance - def _ZAinner(self, pv, Z): + def _ZAinner(self, vp, Z): ZA = Z*self.variances - inner = (pv.mean[:, None, :] * pv.mean[:, :, None]) - diag_indices = np.diag_indices(pv.mean.shape[1], 2) - inner[:, diag_indices[0], diag_indices[1]] += pv.variance + inner = (vp.mean[:, None, :] * vp.mean[:, :, None]) + diag_indices = np.diag_indices(vp.mean.shape[1], 2) + inner[:, diag_indices[0], diag_indices[1]] += vp.variance - return np.dot(ZA, inner).swapaxes(0, 1) # NOTE: self.ZAinner \in [num_inducing x N x input_dim]! + return np.dot(ZA, inner).swapaxes(0, 1) # NOTE: self.ZAinner \in [num_inducing x num_data x input_dim]! def input_sensitivity(self): if self.ARD: return self.variances diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index c80fb646..3bf355be 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -35,92 +35,80 @@ class RBF(Stationary): # PSI statistics # #---------------------------------------# - def parameters_changed(self): - # reset cached results - self._Z, self._mu, self._S = np.empty(shape=(3, 1)) # cached versions of Z,mu,S - - def psi0(self, Z, variational_posterior): return self.Kdiag(variational_posterior.mean) def psi1(self, Z, variational_posterior): - mu = variational_posterior.mean - S = variational_posterior.variance - self._psi_computations(Z, mu, S) - return self._psi1 + _, _, _, psi1 = self._psi1computations(Z, variational_posterior) + return psi1 def psi2(self, Z, variational_posterior): - mu = variational_posterior.mean - S = variational_posterior.variance - self._psi_computations(Z, mu, S) - return self._psi2 + _, _, _, _, _, psi2 = self._psi2computations(Z, variational_posterior) + return psi2 - def update_gradients_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): - #contributions from Kmm - sself.update_gradients_full(dL_dKmm, Z) - - mu = variational_posterior.mean - S = variational_posterior.variance - self._psi_computations(Z, mu, S) + def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): l2 = self.lengthscale **2 #contributions from psi0: - self.variance.gradient += np.sum(dL_dpsi0) + self.variance.gradient = np.sum(dL_dpsi0) + self.lengthscale.gradient = 0. 
#from psi1 - self.variance.gradient += np.sum(dL_dpsi1 * self._psi1 / self.variance) - d_length = self._psi1[:,:,None] * ((self._psi1_dist_sq - 1.)/(self.lengthscale*self._psi1_denom) +1./self.lengthscale) + denom, _, dist_sq, psi1 = self._psi1computations(Z, variational_posterior) + d_length = psi1[:,:,None] * ((dist_sq - 1.)/(self.lengthscale*denom) +1./self.lengthscale) dpsi1_dlength = d_length * dL_dpsi1[:, :, None] if not self.ARD: self.lengthscale.gradient += dpsi1_dlength.sum() else: self.lengthscale.gradient += dpsi1_dlength.sum(0).sum(0) + self.variance.gradient += np.sum(dL_dpsi1 * psi1) / self.variance #from psi2 - d_var = 2.*self._psi2 / self.variance - d_length = 2.*self._psi2[:, :, :, None] * (self._psi2_Zdist_sq * self._psi2_denom + self._psi2_mudist_sq + S[:, None, None, :] / l2) / (self.lengthscale * self._psi2_denom) + S = variational_posterior.variance + denom, Zdist, Zdist_sq, mudist, mudist_sq, psi2 = self._psi2computations(Z, variational_posterior) + d_length = 2.*psi2[:, :, :, None] * (Zdist_sq[None, :,:,:] * denom[:,None,None,:] + mudist_sq + S[:, None, None, :] / l2) / (self.lengthscale * denom[:,None,None,:]) + #TODO: combine denom and l2 as denom_l2?? + #TODO: tidy the above! + #TODO: tensordot below? - self.variance.gradient += np.sum(dL_dpsi2 * d_var) dpsi2_dlength = d_length * dL_dpsi2[:, :, :, None] if not self.ARD: self.lengthscale.gradient += dpsi2_dlength.sum() else: self.lengthscale.gradient += dpsi2_dlength.sum(0).sum(0).sum(0) - def gradients_Z_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): - mu = variational_posterior.mean - S = variational_posterior.variance - self._psi_computations(Z, mu, S) + self.variance.gradient += 2.*np.sum(dL_dpsi2 * psi2)/self.variance + + def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior): l2 = self.lengthscale **2 #psi1 - denominator = (l2 * (self._psi1_denom)) - dpsi1_dZ = -self._psi1[:, :, None] * ((self._psi1_dist / denominator)) + denom, dist, dist_sq, psi1 = self._psi1computations(Z, variational_posterior) + denominator = l2 * denom + dpsi1_dZ = -psi1[:, :, None] * (dist / denominator) grad = np.sum(dL_dpsi1[:, :, None] * dpsi1_dZ, 0) #psi2 - term1 = self._psi2_Zdist / l2 # num_inducing, num_inducing, input_dim - term2 = self._psi2_mudist / self._psi2_denom / l2 # N, num_inducing, num_inducing, input_dim - dZ = self._psi2[:, :, :, None] * (term1[None] + term2) + denom, Zdist, Zdist_sq, mudist, mudist_sq, psi2 = self._psi2computations(Z, variational_posterior) + term1 = Zdist / l2 # M, M, Q + term2 = mudist / denom[:,None,None,:] / l2 # N, M, M, Q + dZ = psi2[:, :, :, None] * (term1[None, :, :, :] + term2) #N,M,M,Q grad += 2*(dL_dpsi2[:, :, :, None] * dZ).sum(0).sum(0) - grad += self.gradients_X(dL_dKmm, Z, None) - return grad - def gradients_q_variational(self, dL_dKmm, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): - mu = variational_posterior.mean - S = variational_posterior.variance - self._psi_computations(Z, mu, S) + def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): l2 = self.lengthscale **2 #psi1 - tmp = self._psi1[:, :, None] / l2 / self._psi1_denom - grad_mu = np.sum(dL_dpsi1[:, :, None] * tmp * self._psi1_dist, 1) - grad_S = np.sum(dL_dpsi1[:, :, None] * 0.5 * tmp * (self._psi1_dist_sq - 1), 1) + denom, dist, dist_sq, psi1 = self._psi1computations(Z, variational_posterior) + tmp = psi1[:, :, None] / l2 / denom + grad_mu = np.sum(dL_dpsi1[:, :, None] * tmp * dist, 1) + grad_S = 
np.sum(dL_dpsi1[:, :, None] * 0.5 * tmp * (dist_sq - 1), 1) #psi2 - tmp = self._psi2[:, :, :, None] / l2 / self._psi2_denom - grad_mu += -2.*(dL_dpsi2[:, :, :, None] * tmp * self._psi2_mudist).sum(1).sum(1) - grad_S += (dL_dpsi2[:, :, :, None] * tmp * (2.*self._psi2_mudist_sq - 1)).sum(1).sum(1) + denom, Zdist, Zdist_sq, mudist, mudist_sq, psi2 = self._psi2computations(Z, variational_posterior) + tmp = psi2[:, :, :, None] / l2 / denom[:,None,None,:] + grad_mu += -2.*(dL_dpsi2[:, :, :, None] * tmp * mudist).sum(1).sum(1) + grad_S += (dL_dpsi2[:, :, :, None] * tmp * (2.*mudist_sq - 1)).sum(1).sum(1) return grad_mu, grad_S @@ -182,83 +170,72 @@ class RBF(Stationary): return target + #@cache_this TODO + def _psi1computations(self, Z, vp): + mu, S = vp.mean, vp.variance + l2 = self.lengthscale **2 + denom = S[:, None, :] / l2 + 1. # N,1,Q + dist = Z[None, :, :] - mu[:, None, :] # N,M,Q + dist_sq = np.square(dist) / l2 / denom # N,M,Q + exponent = -0.5 * np.sum(dist_sq + np.log(denom), -1)#N,M + psi1 = self.variance * np.exp(exponent) # N,M + return denom, dist, dist_sq, psi1 - def _psi_computations(self, Z, mu, S): - # here are the "statistics" for psi1 and psi2 - Z_changed = not fast_array_equal(Z, self._Z) - if Z_changed: - # Z has changed, compute Z specific stuff - self._psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q - self._psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q - self._psi2_Zdist_sq = np.square(self._psi2_Zdist / self.lengthscale) # M,M,Q - if Z_changed or not fast_array_equal(mu, self._mu) or not fast_array_equal(S, self._S): - # something's changed. recompute EVERYTHING - l2 = self.lengthscale **2 - # psi1 - self._psi1_denom = S[:, None, :] / l2 + 1. - self._psi1_dist = Z[None, :, :] - mu[:, None, :] - self._psi1_dist_sq = np.square(self._psi1_dist) / l2 / self._psi1_denom - self._psi1_exponent = -0.5 * np.sum(self._psi1_dist_sq + np.log(self._psi1_denom), -1) - self._psi1 = self.variance * np.exp(self._psi1_exponent) + #@cache_this TODO + def _psi2computations(self, Z, vp): + mu, S = vp.mean, vp.variance - # psi2 - self._psi2_denom = 2.*S[:, None, None, :] / l2 + 1. # N,M,M,Q - self._psi2_mudist, self._psi2_mudist_sq, self._psi2_exponent, _ = self.weave_psi2(mu, self._psi2_Zhat) - # self._psi2_mudist = mu[:,None,None,:]-self._psi2_Zhat #N,M,M,Q - # self._psi2_mudist_sq = np.square(self._psi2_mudist)/(l2*self._psi2_denom) - # self._psi2_exponent = np.sum(-self._psi2_Zdist_sq -self._psi2_mudist_sq -0.5*np.log(self._psi2_denom),-1) #N,M,M,Q - self._psi2 = np.square(self.variance) * np.exp(self._psi2_exponent) # N,M,M,Q + N, Q = mu.shape + M = Z.shape[0] - # store matrices for caching - self._Z, self._mu, self._S = Z, mu, S + #compute required distances + Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q + Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q + Zdist_sq = np.square(Zdist / self.lengthscale) # M,M,Q - def weave_psi2(self, mu, Zhat): - N, input_dim = mu.shape - num_inducing = Zhat.shape[0] + #allocate memory for the things we want to compute + mudist = np.empty((N, M, M, Q)) + mudist_sq = np.empty((N, M, M, Q)) + psi2 = np.empty((N, M, M)) - mudist = np.empty((N, num_inducing, num_inducing, input_dim)) - mudist_sq = np.empty((N, num_inducing, num_inducing, input_dim)) - psi2_exponent = np.zeros((N, num_inducing, num_inducing)) - psi2 = np.empty((N, num_inducing, num_inducing)) + l2 = self.lengthscale **2 + denom = 2.*S / l2 + 1. # N,Q + half_log_denom = 0.5 * np.log(denom) + denom_l2 = denom*l2 # TODO: Max and James: divide?? 
- psi2_Zdist_sq = self._psi2_Zdist_sq - _psi2_denom = self._psi2_denom.squeeze().reshape(N, self.input_dim) - half_log_psi2_denom = 0.5 * np.log(self._psi2_denom).squeeze().reshape(N, self.input_dim) - variance_sq = np.float64(np.square(self.variance)) - if self.ARD: - lengthscale2 = self.lengthscale **2 - else: - lengthscale2 = np.ones(input_dim) * self.lengthscale2**2 + variance_sq = float(np.square(self.variance)) code = """ double tmp; + double exponent; - #pragma omp parallel for private(tmp) + #pragma omp parallel for private(tmp, exponent) for (int n=0; n
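For reference, the psi1 statistic evaluated by the new RBF._psi1computations helper above can be reproduced in isolation. The sketch below is illustrative only, not GPy API: the helper name psi1_rbf and the toy inputs are hypothetical, while the shapes (mu: N x Q posterior means, S: N x Q posterior variances, Z: M x Q inducing inputs) and the arithmetic follow the patch.

import numpy as np

def psi1_rbf(variance, lengthscale, Z, mu, S):
    # psi1[n, m] = E_{q(x_n)}[k(x_n, z_m)] for an (ARD) RBF kernel, mirroring
    # the denom / dist / dist_sq / exponent steps of _psi1computations.
    l2 = lengthscale ** 2                                              # Q,
    denom = S[:, None, :] / l2 + 1.                                    # N,1,Q
    dist_sq = np.square(Z[None, :, :] - mu[:, None, :]) / l2 / denom   # N,M,Q
    exponent = -0.5 * np.sum(dist_sq + np.log(denom), -1)              # N,M
    return variance * np.exp(exponent)                                 # N,M

N, M, Q = 4, 3, 2
rng = np.random.RandomState(0)
mu, S, Z = rng.randn(N, Q), rng.rand(N, Q), rng.randn(M, Q)
psi1 = psi1_rbf(1.0, np.ones(Q), Z, mu, S)
# Sanity check: as S -> 0 the expectation collapses to the plain RBF kernel k(mu, Z).
assert np.allclose(psi1_rbf(1.0, np.ones(Q), Z, mu, 0. * S),
                   np.exp(-0.5 * np.square(mu[:, None, :] - Z[None, :, :]).sum(-1)))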