diff --git a/GPy/inference/optimization/stochastics.py b/GPy/inference/optimization/stochastics.py
index f1532bc5..2151c39c 100644
--- a/GPy/inference/optimization/stochastics.py
+++ b/GPy/inference/optimization/stochastics.py
@@ -5,6 +5,10 @@ class StochasticStorage(object):
     '''
     This is a container for holding the stochastic parameters,
    such as subset indices or step length and so on.
+
+    self.d has to be a list of lists:
+    [dimension indices, boolean mask of the non-NaN rows those dimensions share]
+    so that the minibatches can be processed as efficiently as possible.
     '''
     def __init__(self, model):
         """
@@ -28,9 +32,25 @@ class SparseGPMissing(StochasticStorage):
         """
         Here we want to loop over all dimensions everytime.
         Thus, we can just make sure the loop goes over self.d every
-        time.
+        time. We group together dimensions whose missing-data patterns
+        match, which speeds up the calculations significantly.
         """
-        self.d = range(model.Y_normalized.shape[1])
+        import numpy as np
+        self.Y = model.Y_normalized
+        bdict = {}
+        for d in range(self.Y.shape[1]):
+            inan = np.isnan(self.Y[:, d])
+            # Use the column's NaN pattern as a hashable key, so that
+            # dimensions with identical patterns land in the same batch.
+            arr_str = np.array2string(inan, max_line_width=np.inf,
+                                      precision=0, suppress_small=True,
+                                      separator='',
+                                      formatter={'bool': lambda x: '1' if x else '0'})
+            try:
+                bdict[arr_str][0].append(d)
+            except KeyError:
+                bdict[arr_str] = [[d], ~inan]
+        self.d = list(bdict.values())

 class SparseGPStochastics(StochasticStorage):
     """
@@ -40,16 +60,31 @@ class SparseGPStochastics(StochasticStorage):
     def __init__(self, model, batchsize=1):
         self.batchsize = batchsize
         self.output_dim = model.Y.shape[1]
+        self.Y = model.Y_normalized
         self.reset()
         self.do_stochastics()

     def do_stochastics(self):
+        import numpy as np
         if self.batchsize == 1:
             self.current_dim = (self.current_dim+1)%self.output_dim
-            self.d = [self.current_dim]
+            self.d = [[[self.current_dim], ~np.isnan(self.Y[:, self.current_dim])]]
         else:
-            import numpy as np
             self.d = np.random.choice(self.output_dim, size=self.batchsize, replace=False)
+            # Group the sampled dimensions by their NaN pattern, exactly as
+            # SparseGPMissing does above.
+            bdict = {}
+            for d in self.d:
+                inan = np.isnan(self.Y[:, d])
+                arr_str = np.array2string(inan, max_line_width=np.inf,
+                                          precision=0, suppress_small=True,
+                                          separator='',
+                                          formatter={'bool': lambda x: '1' if x else '0'})
+                try:
+                    bdict[arr_str][0].append(d)
+                except KeyError:
+                    bdict[arr_str] = [[d], ~inan]
+            self.d = list(bdict.values())

     def reset(self):
         self.current_dim = -1
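
# --- Editor's note (not part of the patch) --------------------------------
# The grouping trick in SparseGPMissing hashes each output column's pattern
# of missing values so that columns sharing a pattern are handled in one
# batch. A minimal standalone sketch of the idea, assuming a small toy
# array Y; here tobytes() stands in for the patch's np.array2string hack as
# an exact, hashable encoding of the NaN pattern:
#
#     import numpy as np
#
#     Y = np.array([[1.0,    2.0, 3.0],
#                   [np.nan, 4.0, np.nan],
#                   [5.0,    6.0, 7.0]])
#
#     batches = {}
#     for d in range(Y.shape[1]):
#         inan = np.isnan(Y[:, d])
#         key = inan.tobytes()
#         dims, _ = batches.setdefault(key, ([], ~inan))
#         dims.append(d)
#
#     for dims, ninan in batches.values():
#         print(dims, ninan)
#     # -> [0, 2] [ True False  True]
#     #    [1] [ True  True  True]
# ---------------------------------------------------------------------------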
diff --git a/GPy/models/sparse_gp_minibatch.py b/GPy/models/sparse_gp_minibatch.py
index ad62043a..07295255 100644
--- a/GPy/models/sparse_gp_minibatch.py
+++ b/GPy/models/sparse_gp_minibatch.py
@@ -63,33 +63,18 @@ class SparseGPMiniBatch(SparseGP):
         if stochastic and missing_data:
             self.missing_data = True
-            self.ninan = ~np.isnan(Y)
             self.stochastics = SparseGPStochastics(self, batchsize)
         elif stochastic and not missing_data:
             self.missing_data = False
             self.stochastics = SparseGPStochastics(self, batchsize)
         elif missing_data:
             self.missing_data = True
-            self.ninan = ~np.isnan(Y)
             self.stochastics = SparseGPMissing(self)
         else:
             self.stochastics = False

         logger.info("Adding Z as parameter")
         self.link_parameter(self.Z, index=0)

-        if self.missing_data:
-            self.Ylist = []
-            overall = self.Y_normalized.shape[1]
-            m_f = lambda i: "Precomputing Y for missing data: {: >7.2%}".format(float(i+1)/overall)
-            message = m_f(-1)
-            print(message, end=' ')
-            for d in range(overall):
-                self.Ylist.append(self.Y_normalized[self.ninan[:, d], d][:, None])
-                print(' '*(len(message)+1) + '\r', end=' ')
-                message = m_f(d)
-                print(message, end=' ')
-            print('')
-
         self.posterior = None

     def has_uncertain_inputs(self):
@@ -245,8 +230,7 @@ class SparseGPMiniBatch(SparseGP):
             message = m_f(-1)
             print(message, end=' ')

-        for d in self.stochastics.d:
-            ninan = self.ninan[:, d]
+        for d, ninan in self.stochastics.d:
             if not self.stochastics:
                 print(' '*(len(message)) + '\r', end=' ')
@@ -257,7 +241,7 @@ class SparseGPMiniBatch(SparseGP):
             grad_dict, current_values, value_indices = self._inner_parameters_changed(
                 self.kern, self.X[ninan], self.Z, self.likelihood,
-                self.Ylist[d], self.Y_metadata,
+                self.Y_normalized[ninan][:, d], self.Y_metadata,
                 Lm, dL_dKmm,
                 subset_indices=dict(outputs=d, samples=ninan))
@@ -266,8 +250,8 @@ class SparseGPMiniBatch(SparseGP):
             Lm = posterior.K_chol
             dL_dKmm = grad_dict['dL_dKmm']
-            woodbury_inv[:, :, d] = posterior.woodbury_inv
-            woodbury_vector[:, d:d+1] = posterior.woodbury_vector
+            woodbury_inv[:, :, d] = posterior.woodbury_inv[:, :, None]
+            woodbury_vector[:, d] = posterior.woodbury_vector
             self._log_marginal_likelihood += log_marginal_likelihood

         if not self.stochastics:
             print('')
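
# --- Editor's note (not part of the patch) --------------------------------
# After this change each entry of stochastics.d carries a whole group of
# output dimensions plus the non-NaN row mask they share, so the inner loop
# above can slice X and Y once per group instead of once per column. A
# hedged sketch of that consumer side; d_groups is hand-built and the
# commented inner_update call is a hypothetical stand-in for
# _inner_parameters_changed:
#
#     import numpy as np
#
#     X = np.arange(8.0).reshape(4, 2)
#     Y = np.array([[1.0,    2.0, 3.0],
#                   [np.nan, 4.0, np.nan],
#                   [5.0,    6.0, 7.0],
#                   [8.0,    9.0, 0.5]])
#
#     # Grouped as SparseGPMissing would: dims 0 and 2 share a NaN pattern.
#     d_groups = [([0, 2], ~np.isnan(Y[:, 0])),
#                 ([1],    ~np.isnan(Y[:, 1]))]
#
#     for d, ninan in d_groups:
#         X_batch = X[ninan]        # rows valid for every dim in the group
#         Y_batch = Y[ninan][:, d]  # (n_valid, len(d)) block with no NaNs
#         # inner_update(X_batch, Y_batch, outputs=d, samples=ninan)
#         print(d, X_batch.shape, Y_batch.shape)
#     # -> [0, 2] (3, 2) (3, 2)
#     #    [1] (4, 2) (4, 1)
# ---------------------------------------------------------------------------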