From cb024f0bf834f707b3afa26c5fcdaa0a1f88144b Mon Sep 17 00:00:00 2001
From: Max Zwiessele
Date: Fri, 4 Sep 2015 10:33:32 +0100
Subject: [PATCH] [spgp minibatch] added new routine for psi NxMxM, much
 faster, slightly bigger memory footprint

---
 GPy/inference/optimization/stochastics.py |  37 ++++----
 GPy/kern/_src/kern.py                     |  18 +---
 GPy/kern/_src/rbf.py                      |   5 +-
 GPy/models/bayesian_gplvm_minibatch.py    |  78 ++++++----------
 GPy/models/sparse_gp_minibatch.py         | 109 ++++++++++------
 5 files changed, 101 insertions(+), 146 deletions(-)

diff --git a/GPy/inference/optimization/stochastics.py b/GPy/inference/optimization/stochastics.py
index f7ad1245..0fc488a2 100644
--- a/GPy/inference/optimization/stochastics.py
+++ b/GPy/inference/optimization/stochastics.py
@@ -43,7 +43,7 @@ class SparseGPMissing(StochasticStorage):
         np.set_printoptions(threshold='nan')
         for d in range(self.Y.shape[1]):
             inan = np.isnan(self.Y)[:, d]
-            arr_str = np.array2string(~inan, np.inf, 0, True, '', formatter={'bool':lambda x: '1' if x else '0'})
+            arr_str = np.array2string(inan, np.inf, 0, True, '', formatter={'bool':lambda x: '1' if x else '0'})
             try:
                 bdict[arr_str][0].append(d)
             except:
@@ -56,35 +56,36 @@ class SparseGPStochastics(StochasticStorage):
     For the sparse gp we need to store the dimension we are in,
     and the indices corresponding to those
     """
-    def __init__(self, model, batchsize=1):
+    def __init__(self, model, batchsize=1, missing_data=True):
         self.batchsize = batchsize
         self.output_dim = model.Y.shape[1]
         self.Y = model.Y_normalized
+        self.missing_data = missing_data
         self.reset()
         self.do_stochastics()

     def do_stochastics(self):
+        import numpy as np
         if self.batchsize == 1:
             self.current_dim = (self.current_dim+1)%self.output_dim
-            self.d = [[[self.current_dim], np.isnan(self.Y[:, self.d])]]
+            self.d = [[[self.current_dim], np.isnan(self.Y[:, self.current_dim]) if self.missing_data else None]]
         else:
-            import numpy as np
             self.d = np.random.choice(self.output_dim, size=self.batchsize, replace=False)
             bdict = {}
-            opt = np.get_printoptions()
-            np.set_printoptions(threshold='nan')
-            for d in self.d:
-                inan = np.isnan(self.Y[:, d])
-                arr_str = int(np.array2string(inan,
-                                          np.inf, 0,
-                                          True, '',
-                                          formatter={'bool':lambda x: '1' if x else '0'}), 2)
-                try:
-                    bdict[arr_str][0].append(d)
-                except:
-                    bdict[arr_str] = [[d], ~inan]
-            np.set_printoptions(**opt)
-            self.d = bdict.values()
+            if self.missing_data:
+                opt = np.get_printoptions()
+                np.set_printoptions(threshold='nan')
+                for d in self.d:
+                    inan = np.isnan(self.Y[:, d])
+                    arr_str = np.array2string(inan, np.inf, 0, True, '', formatter={'bool':lambda x: '1' if x else '0'})
+                    try:
+                        bdict[arr_str][0].append(d)
+                    except:
+                        bdict[arr_str] = [[d], ~inan]
+                np.set_printoptions(**opt)
+                self.d = bdict.values()
+            else:
+                self.d = [[self.d, None]]

     def reset(self):
         self.current_dim = -1
diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py
index 77f4d563..f6971faf 100644
--- a/GPy/kern/_src/kern.py
+++ b/GPy/kern/_src/kern.py
@@ -58,24 +58,10 @@ class Kern(Parameterized):
         self._sliced_X = 0
         self.useGPU = self._support_GPU and useGPU

-        self._return_psi2_n_flag = ObsAr(np.zeros(1).astype(bool))
-
         from .psi_comp import PSICOMP_GH
         self.psicomp = PSICOMP_GH()

-    @property
-    def return_psi2_n(self):
-        """
-        Flag whether to pass back psi2 as NxMxM or MxM, by summing out N.
-        """
-        return self._return_psi2_n_flag[0]
-    @return_psi2_n.setter
-    def return_psi2_n(self, val):
-        def visit(self):
-            if isinstance(self, Kern):
-                self._return_psi2_n_flag[0]=val
-        self.traverse(visit)
-
     @Cache_this(limit=20)
     def _slice_X(self, X):
         return X[:, self.active_dims]
@@ -97,7 +83,9 @@ class Kern(Parameterized):
     def psi1(self, Z, variational_posterior):
         return self.psicomp.psicomputations(self, Z, variational_posterior)[1]
     def psi2(self, Z, variational_posterior):
-        return self.psicomp.psicomputations(self, Z, variational_posterior, return_psi2_n=self.return_psi2_n)[2]
+        return self.psicomp.psicomputations(self, Z, variational_posterior, return_psi2_n=False)[2]
+    def psi2n(self, Z, variational_posterior):
+        return self.psicomp.psicomputations(self, Z, variational_posterior, return_psi2_n=True)[2]
     def gradients_X(self, dL_dK, X, X2):
         raise NotImplementedError
     def gradients_X_diag(self, dL_dKdiag, X):
diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py
index 3ba782b1..145b9bfd 100644
--- a/GPy/kern/_src/rbf.py
+++ b/GPy/kern/_src/rbf.py
@@ -56,7 +56,10 @@ class RBF(Stationary):
         return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[1]

     def psi2(self, Z, variational_posterior):
-        return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior, return_psi2_n=self.return_psi2_n)[2]
+        return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior, return_psi2_n=False)[2]
+
+    def psi2n(self, Z, variational_posterior):
+        return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior, return_psi2_n=True)[2]

     def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior, psi0=None, psi1=None, psi2=None, Lpsi0=None, Lpsi1=None, Lpsi2=None):
diff --git a/GPy/models/bayesian_gplvm_minibatch.py b/GPy/models/bayesian_gplvm_minibatch.py
index 5fd9af60..1d06f07f 100644
--- a/GPy/models/bayesian_gplvm_minibatch.py
+++ b/GPy/models/bayesian_gplvm_minibatch.py
@@ -81,38 +81,9 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
         """Get the gradients of the posterior distribution of X in its specific form."""
         return X.mean.gradient, X.variance.gradient

-    def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None,psi0=None, psi1=None, psi2=None, **kw):
+    def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, psi0=None, psi1=None, psi2=None, **kw):
         posterior, log_marginal_likelihood, grad_dict = super(BayesianGPLVMMiniBatch, self)._inner_parameters_changed(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm, psi0=psi0, psi1=psi1, psi2=psi2, **kw)

-        kl_fctr = self.kl_factr
-        if self.has_uncertain_inputs():
-            if self.missing_data:
-                d = self.output_dim
-                log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(X)/d
-            else:
-                log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(X)
-
-            X.mean.gradient[:] = 0
-            X.variance.gradient[:] = 0
-
-            self.variational_prior.update_gradients_KL(X)
-            if self.missing_data:
-                grad_dict['meangrad'] = kl_fctr*X.mean.gradient/d
-                grad_dict['vargrad'] = kl_fctr*X.variance.gradient/d
-            else:
-                grad_dict['meangrad'] = kl_fctr*X.mean.gradient
-                grad_dict['vargrad'] = kl_fctr*X.variance.gradient
-
-        #if subset_indices is not None:
-            #value_indices['meangrad'] = subset_indices['samples']
-            #value_indices['vargrad'] = subset_indices['samples']
-
-        #grad_dict['meangrad'] = kl_fctr*self.X.mean.gradient
-        #grad_dict['vargrad'] = kl_fctr*self.X.variance.gradient
-        #self.X.mean.gradient[:] = 0
-        #self.X.variance.gradient[:] = 0
-        #import ipdb; ipdb.set_trace()  # XXX BREAKPOINT
-
         return posterior, log_marginal_likelihood, grad_dict

     def _outer_values_update(self, full_values):
@@ -127,36 +98,41 @@ class BayesianGPLVMMiniBatch(SparseGPMiniBatch):
                     Z=self.Z, dL_dpsi0=full_values['dL_dpsi0'],
                     dL_dpsi1=full_values['dL_dpsi1'],
                     dL_dpsi2=full_values['dL_dpsi2'],
-                    psi0=self.psi0, psi1=self.psi1, psi2=self.psi2,
-                    Lpsi0=full_values['Lpsi0'], Lpsi1=full_values['Lpsi1'], Lpsi2=full_values['Lpsi2'])
-            full_values['meangrad'] += meangrad_tmp
-            full_values['vargrad'] += vargrad_tmp
-        else:
-            full_values['Xgrad'] = self.kern.gradients_X(full_values['dL_dKnm'], self.X, self.Z)
-            full_values['Xgrad'] += self.kern.gradients_X_diag(full_values['dL_dKdiag'], self.X)
+                    psi0=self.psi0, psi1=self.psi1, psi2=self.psi2)
+
+            kl_fctr = self.kl_factr
+
+            self.X.mean.gradient[:] = 0
+            self.X.variance.gradient[:] = 0
+            self.variational_prior.update_gradients_KL(self.X)
+
+            if self.missing_data or not self.stochastics:
+                self.X.mean.gradient = kl_fctr*self.X.mean.gradient
+                self.X.variance.gradient = kl_fctr*self.X.variance.gradient
+            else:
+                d = self.output_dim
+                self.X.mean.gradient = kl_fctr*self.X.mean.gradient*self.stochastics.batchsize/d
+                self.X.variance.gradient = kl_fctr*self.X.variance.gradient*self.stochastics.batchsize/d
+
+            self.X.mean.gradient += meangrad_tmp
+            self.X.variance.gradient += vargrad_tmp

-        if self.has_uncertain_inputs():
-            self.X.mean.gradient = full_values['meangrad']
-            self.X.variance.gradient = full_values['vargrad']
         else:
-            self.X.gradient = full_values['Xgrad']
+            self.X.gradient = self.kern.gradients_X(full_values['dL_dKnm'], self.X, self.Z)
+            self.X.gradient += self.kern.gradients_X_diag(full_values['dL_dKdiag'], self.X)

     def _outer_init_full_values(self):
         full_values = super(BayesianGPLVMMiniBatch, self)._outer_init_full_values()
-        full_values['meangrad'] = np.zeros((self.X.shape[0], self.X.shape[1]))
-        full_values['vargrad'] = np.zeros((self.X.shape[0], self.X.shape[1]))
-
-        full_values['dL_dpsi0'] = ObsAr(np.zeros(self.X.shape[0]))
-        full_values['dL_dpsi1'] = ObsAr(np.zeros((self.X.shape[0], self.Z.shape[0])))
-        full_values['dL_dpsi2'] = ObsAr(np.zeros((self.Z.shape[0], self.Z.shape[0])))
-
-        full_values['Lpsi0'] = ObsAr(np.zeros(self.X.shape[0]))
-        full_values['Lpsi1'] = ObsAr(np.zeros((self.X.shape[0], self.Z.shape[0])))
-        full_values['Lpsi2'] = ObsAr(np.zeros((self.X.shape[0], self.Z.shape[0], self.Z.shape[0])))
         return full_values

     def parameters_changed(self):
         super(BayesianGPLVMMiniBatch,self).parameters_changed()
+        kl_fctr = self.kl_factr
+        if self.missing_data or not self.stochastics:
+            self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X)
+        elif self.stochastics:
+            d = self.output_dim
+            self._log_marginal_likelihood -= kl_fctr*self.variational_prior.KL_divergence(self.X)*self.stochastics.batchsize/d
+
         if isinstance(self.inference_method, VarDTC_minibatch):
             return
diff --git a/GPy/models/sparse_gp_minibatch.py b/GPy/models/sparse_gp_minibatch.py
index 92b2baf5..ecb53f0f 100644
--- a/GPy/models/sparse_gp_minibatch.py
+++ b/GPy/models/sparse_gp_minibatch.py
@@ -63,10 +63,10 @@ class SparseGPMiniBatch(SparseGP):

         if stochastic and missing_data:
             self.missing_data = True
-            self.stochastics = SparseGPStochastics(self, batchsize)
+            self.stochastics = SparseGPStochastics(self, batchsize, self.missing_data)
         elif stochastic and not missing_data:
             self.missing_data = False
-            self.stochastics = SparseGPStochastics(self, batchsize)
+            self.stochastics = SparseGPStochastics(self, batchsize, self.missing_data)
         elif missing_data:
             self.missing_data = True
             self.stochastics = SparseGPMissing(self)
@@ -105,11 +105,6 @@ class SparseGPMiniBatch(SparseGP):
             psi2_sum_n = psi2.sum(axis=0)
         posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm, psi0=psi0, psi1=psi1, psi2=psi2_sum_n, **kwargs)
-
-        if self.has_uncertain_inputs():
-            grad_dict['Lpsi0'] = grad_dict['dL_dpsi0']*psi0
-            grad_dict['Lpsi1'] = grad_dict['dL_dpsi1']*psi1
-            grad_dict['Lpsi2'] = grad_dict['dL_dpsi2']*psi2
         return posterior, log_marginal_likelihood, grad_dict

     def _inner_take_over_or_update(self, full_values=None, current_values=None, value_indices=None):
@@ -144,14 +139,10 @@ class SparseGPMiniBatch(SparseGP):
             else:
                 index = slice(None)
             if key in full_values:
-                if np.isscalar(current_values[key]):
+                try:
+                    full_values[key][index] += current_values[key]
+                except:
                     full_values[key] += current_values[key]
-                else:
-                    from ..core.parameterization.observable_array import ObsAr
-                    if isinstance(full_values[key], ObsAr):
-                        full_values[key].values[index] += current_values[key]
-                    else:
-                        full_values[key][index] += current_values[key]
             else:
                 full_values[key] = current_values[key]
@@ -174,43 +165,39 @@ class SparseGPMiniBatch(SparseGP):
             #gradients wrt kernel
             dL_dKmm = full_values['dL_dKmm']
             self.kern.update_gradients_full(dL_dKmm, self.Z, None)
-            full_values['kerngrad'] = self.kern.gradient.copy()
+            kgrad = self.kern.gradient.copy()
             self.kern.update_gradients_expectations(
                 variational_posterior=self.X,
                 Z=self.Z,
                 dL_dpsi0=full_values['dL_dpsi0'],
                 dL_dpsi1=full_values['dL_dpsi1'],
                 dL_dpsi2=full_values['dL_dpsi2'],
-                psi0=self.psi0, psi1=self.psi1, psi2=self.psi2,
-                Lpsi0=full_values['Lpsi0'], Lpsi1=full_values['Lpsi1'], Lpsi2=full_values['Lpsi2'])
-            full_values['kerngrad'] += self.kern.gradient
+                psi0=self.psi0, psi1=self.psi1, psi2=self.psi2)
+            self.kern.gradient += kgrad
+
             #gradients wrt Z
-            full_values['Zgrad'] = self.kern.gradients_X(dL_dKmm, self.Z)
-            full_values['Zgrad'] += self.kern.gradients_Z_expectations(
+            self.Z.gradient = self.kern.gradients_X(dL_dKmm, self.Z)
+            self.Z.gradient += self.kern.gradients_Z_expectations(
                 variational_posterior=self.X,
                 Z=self.Z,
                 dL_dpsi0=full_values['dL_dpsi0'],
                 dL_dpsi1=full_values['dL_dpsi1'],
                 dL_dpsi2=full_values['dL_dpsi2'],
-                psi0=self.psi0, psi1=self.psi1, psi2=self.psi2,
-                Lpsi0=full_values['Lpsi0'], Lpsi1=full_values['Lpsi1'], Lpsi2=full_values['Lpsi2'])
+                psi0=self.psi0, psi1=self.psi1, psi2=self.psi2)
         else:
             #gradients wrt kernel
             self.kern.update_gradients_diag(full_values['dL_dKdiag'], self.X)
-            full_values['kerngrad'] = self.kern.gradient.copy()
+            kgrad = self.kern.gradient.copy()
             self.kern.update_gradients_full(full_values['dL_dKnm'], self.X, self.Z)
-            full_values['kerngrad'] += self.kern.gradient
+            kgrad += self.kern.gradient
             self.kern.update_gradients_full(full_values['dL_dKmm'], self.Z, None)
-            full_values['kerngrad'] += self.kern.gradient
+            self.kern.gradient += kgrad
+            #kgrad += self.kern.gradient
+
             #gradients wrt Z
-            full_values['Zgrad'] = self.kern.gradients_X(full_values['dL_dKmm'], self.Z)
-            full_values['Zgrad'] += self.kern.gradients_X(full_values['dL_dKnm'].T, self.Z, self.X)
+            self.Z.gradient = self.kern.gradients_X(full_values['dL_dKmm'], self.Z)
+            self.Z.gradient += self.kern.gradients_X(full_values['dL_dKnm'].T, self.Z, self.X)

         self.likelihood.update_gradients(full_values['dL_dthetaL'])
-        full_values['likgrad'] = self.likelihood.gradient.copy()
-
-        self.likelihood.gradient = full_values['likgrad']
-        self.kern.gradient = full_values['kerngrad']
-        self.Z.gradient = full_values['Zgrad']

     def _outer_init_full_values(self):
         """
@@ -225,8 +212,15 @@ class SparseGPMiniBatch(SparseGP):
         to initialize the gradients for the mean
         and the variance in order to have the full gradient for indexing)
         """
-        return {'dL_dKdiag': np.zeros(self.X.shape[0]),
-                'dL_dKnm': np.zeros((self.X.shape[0], self.Z.shape[0]))}
+        retd = dict(dL_dKmm=np.zeros((self.Z.shape[0], self.Z.shape[0])))
+        if self.has_uncertain_inputs():
+            retd.update(dict(dL_dpsi0=np.zeros(self.X.shape[0]),
+                             dL_dpsi1=np.zeros((self.X.shape[0], self.Z.shape[0])),
+                             dL_dpsi2=np.zeros((self.X.shape[0], self.Z.shape[0], self.Z.shape[0]))))
+        else:
+            retd.update({'dL_dKdiag': np.zeros(self.X.shape[0]),
+                         'dL_dKnm': np.zeros((self.X.shape[0], self.Z.shape[0]))})
+        return retd

     def _outer_loop_for_missing_data(self):
         Lm = None
@@ -247,35 +241,17 @@ class SparseGPMiniBatch(SparseGP):
                 message = m_f(-1)
                 print(message, end=' ')

-        #Compute the psi statistics for N once, but don't sum out N in psi2
-        if self.has_uncertain_inputs():
-            self.kern.return_psi2_n = True
-            from ..core.parameterization.observable_array import ObsAr
-            psi0 = ObsAr(self.kern.psi0(self.Z, self.X))
-            psi1 = ObsAr(self.kern.psi1(self.Z, self.X))
-            psi2 = ObsAr(self.kern.psi2(self.Z, self.X))
-        else:
-            psi0 = self.kern.Kdiag(self.X)
-            psi1 = self.kern.K(self.X, self.Z)
-            psi2 = None
-
-        self.psi0 = psi0
-        self.psi1 = psi1
-        self.psi2 = psi2
-
         for d, ninan in self.stochastics.d:
             if not self.stochastics:
                 print(' '*(len(message)) + '\r', end=' ')
                 message = m_f(d)
                 print(message, end=' ')

-            psi0ni = psi0[ninan]
-            psi1ni = psi1[ninan]
+            psi0ni = self.psi0[ninan]
+            psi1ni = self.psi1[ninan]
             if self.has_uncertain_inputs():
-                psi2ni = psi2[ninan]
-                #value_indices = dict(outputs=d, samples=ninan, dL_dpsi0=ninan, dL_dpsi1=ninan, meangrad=ninan, vargrad=ninan)
-                value_indices = dict(outputs=d, samples=ninan, dL_dpsi0=ninan, dL_dpsi1=ninan, meangrad=ninan, vargrad=ninan,
-                                     Lpsi0=ninan, Lpsi1=ninan, Lpsi2=ninan)
+                psi2ni = self.psi2[ninan]
+                value_indices = dict(outputs=d, samples=ninan, dL_dpsi0=ninan, dL_dpsi1=ninan, dL_dpsi2=ninan)
             else:
                 psi2ni = None
                 value_indices = dict(outputs=d, samples=ninan, dL_dKdiag=ninan, dL_dKnm=ninan)
@@ -290,7 +266,7 @@ class SparseGPMiniBatch(SparseGP):
             # Fill out the full values by adding in the apporpriate grad_dict
             # values
             self._inner_take_over_or_update(self.full_values, grad_dict, value_indices)
-            self._inner_values_update(grad_dict) # What is this for?
+            self._inner_values_update(grad_dict) # What is this for? -> MRD

             woodbury_inv[:, :, d] = posterior.woodbury_inv[:,:,None]
             woodbury_vector[:, d] = posterior.woodbury_vector
@@ -307,8 +283,6 @@ class SparseGPMiniBatch(SparseGP):
             self.kern.return_psi2_n = False

     def _outer_loop_without_missing_data(self):
-        self._log_marginal_likelihood = 0
-
         if self.posterior is None:
             woodbury_inv = np.zeros((self.num_inducing, self.num_inducing, self.output_dim))
             woodbury_vector = np.zeros((self.num_inducing, self.output_dim))
@@ -316,14 +290,14 @@ class SparseGPMiniBatch(SparseGP):
             woodbury_inv = self.posterior._woodbury_inv
             woodbury_vector = self.posterior._woodbury_vector

-        d = self.stochastics.d
+        d = self.stochastics.d[0][0]
         posterior, log_marginal_likelihood, grad_dict= self._inner_parameters_changed(
             self.kern, self.X, self.Z,
             self.likelihood, self.Y_normalized[:, d], self.Y_metadata)
         self.grad_dict = grad_dict

-        self._log_marginal_likelihood += log_marginal_likelihood
+        self._log_marginal_likelihood = log_marginal_likelihood

         self._outer_values_update(self.grad_dict)
@@ -334,6 +308,19 @@ class SparseGPMiniBatch(SparseGP):
                                        K=posterior._K, mean=None, cov=None, K_chol=posterior.K_chol)

     def parameters_changed(self):
+        #Compute the psi statistics for N once, but don't sum out N in psi2
+        if self.has_uncertain_inputs():
+            #psi0 = ObsAr(self.kern.psi0(self.Z, self.X))
+            #psi1 = ObsAr(self.kern.psi1(self.Z, self.X))
+            #psi2 = ObsAr(self.kern.psi2(self.Z, self.X))
+            self.psi0 = self.kern.psi0(self.Z, self.X)
+            self.psi1 = self.kern.psi1(self.Z, self.X)
+            self.psi2 = self.kern.psi2n(self.Z, self.X)
+        else:
+            self.psi0 = self.kern.Kdiag(self.X)
+            self.psi1 = self.kern.K(self.X, self.Z)
+            self.psi2 = None
+
         if self.missing_data:
             self._outer_loop_for_missing_data()
         elif self.stochastics:
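
Usage note (not part of the patch): a minimal sketch of the psi2 / psi2n split this commit introduces, assuming a GPy checkout that contains it. The sizes, the RBF kernel and the NormalPosterior construction below are illustrative only; kern.psi2 and kern.psi2n are the two entry points added by the patch.

    import numpy as np
    import GPy
    from GPy.core.parameterization.variational import NormalPosterior

    N, M, Q = 100, 10, 2                   # data points, inducing points, latent dims
    Z = np.random.randn(M, Q)              # inducing inputs
    q_X = NormalPosterior(np.random.randn(N, Q), np.full((N, Q), 0.1))
    kern = GPy.kern.RBF(Q)

    psi2 = kern.psi2(Z, q_X)               # (M, M): psi2 statistic with N summed out
    psi2n = kern.psi2n(Z, q_X)             # (N, M, M): one MxM block per data point

    # the per-point blocks sum back to the pooled statistic
    assert np.allclose(psi2n.sum(axis=0), psi2)

    # With missing data, the rows observed for one output dimension can now be
    # sliced out of the precomputed NxMxM array instead of recomputing the psi
    # statistics for every minibatch -- the speed/memory trade-off named in the
    # subject line.
    Y = np.random.randn(N, 1)
    Y[::3] = np.nan                        # pretend a third of the rows are unobserved
    observed = ~np.isnan(Y[:, 0])
    psi2_d = psi2n[observed]               # (N_obs, M, M)

This is also why _outer_init_full_values above now allocates dL_dpsi2 as an NxMxM buffer when the inputs are uncertain: the per-output gradients are accumulated into it row-by-row via the ninan masks.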