diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py
index 6b923609..ecc1e4ba 100644
--- a/GPy/core/sparse_gp.py
+++ b/GPy/core/sparse_gp.py
@@ -9,6 +9,7 @@
 from .. import likelihoods
 from parameterization.variational import VariationalPosterior
 import logging
+from GPy.inference.latent_function_inference.posterior import Posterior
 logger = logging.getLogger("sparse gp")
 
 class SparseGP(GP):
@@ -34,12 +35,18 @@ class SparseGP(GP):
 
     """
 
-    def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None, name='sparse gp', Y_metadata=None, normalizer=False):
+    def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
+                 name='sparse gp', Y_metadata=None, normalizer=False,
+                 missing_data=False):
+
+        self.missing_data = missing_data
+        if self.missing_data:
+            self.ninan = ~np.isnan(Y)
 
         #pick a sensible inference method
         if inference_method is None:
             if isinstance(likelihood, likelihoods.Gaussian):
-                inference_method = var_dtc.VarDTC()
+                inference_method = var_dtc.VarDTC(limit=1 if not self.missing_data else Y.shape[1])
             else:
                 #inference_method = ??
                 raise NotImplementedError, "what to do what to do?"
@@ -51,44 +58,200 @@ class SparseGP(GP):
         GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
         logger.info("Adding Z as parameter")
         self.link_parameter(self.Z, index=0)
+        if self.missing_data:
+            self.Ylist = []
+            overall = self.Y_normalized.shape[1]
+            m_f = lambda i: "Precomputing Y for missing data: {: >7.2%}".format(float(i+1)/overall)
+            message = m_f(-1)
+            print message,
+            for d in xrange(overall):
+                self.Ylist.append(self.Y_normalized[self.ninan[:, d], d][:, None])
+                print ' '*(len(message)+1) + '\r',
+                message = m_f(d)
+                print message,
+            print ''
+
     def has_uncertain_inputs(self):
         return isinstance(self.X, VariationalPosterior)
 
-    def parameters_changed(self):
-        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata)
-        self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
-        if isinstance(self.X, VariationalPosterior):
+    def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None):
+        """
+        This is the standard part, which usually belongs in parameters_changed.
+
+        For automatic handling of subsampling (such as missing data, stochastics
+        etc.), this needs to sit in an inner loop, so that the gradients of
+        different subsets of the data can be handled separately.
+
+        The dict current_values will be passed around for the rest of the
+        algorithm, so this is the place to store current values, such as
+        subsets etc., if necessary.
+
+        If Lm and dL_dKmm can be precomputed (or only need to be computed once),
+        pass them in here and they will be forwarded to the inference_method.
+
+        subset_indices is a dictionary of indices. You can store the indices
+        however you like in this dictionary, for inner use inside the
+        algorithm.
+        """
+        posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=None)
+        current_values = {}
+        likelihood.update_gradients(grad_dict['dL_dthetaL'])
+        current_values['likgrad'] = likelihood.gradient.copy()
+        if subset_indices is None:
+            subset_indices = {}
+        if isinstance(X, VariationalPosterior):
             #gradients wrt kernel
-            dL_dKmm = self.grad_dict['dL_dKmm']
-            self.kern.update_gradients_full(dL_dKmm, self.Z, None)
-            target = self.kern.gradient.copy()
-            self.kern.update_gradients_expectations(variational_posterior=self.X,
-                                                    Z=self.Z,
-                                                    dL_dpsi0=self.grad_dict['dL_dpsi0'],
-                                                    dL_dpsi1=self.grad_dict['dL_dpsi1'],
-                                                    dL_dpsi2=self.grad_dict['dL_dpsi2'])
-            self.kern.gradient += target
+            dL_dKmm = grad_dict['dL_dKmm']
+            kern.update_gradients_full(dL_dKmm, Z, None)
+            current_values['kerngrad'] = kern.gradient.copy()
+            kern.update_gradients_expectations(variational_posterior=X,
+                                               Z=Z,
+                                               dL_dpsi0=grad_dict['dL_dpsi0'],
+                                               dL_dpsi1=grad_dict['dL_dpsi1'],
+                                               dL_dpsi2=grad_dict['dL_dpsi2'])
+            current_values['kerngrad'] += kern.gradient
 
             #gradients wrt Z
-            self.Z.gradient = self.kern.gradients_X(dL_dKmm, self.Z)
-            self.Z.gradient += self.kern.gradients_Z_expectations(
-                               self.grad_dict['dL_dpsi0'],
-                               self.grad_dict['dL_dpsi1'],
-                               self.grad_dict['dL_dpsi2'],
-                               Z=self.Z,
-                               variational_posterior=self.X)
+            current_values['Zgrad'] = kern.gradients_X(dL_dKmm, Z)
+            current_values['Zgrad'] += kern.gradients_Z_expectations(
+                               grad_dict['dL_dpsi0'],
+                               grad_dict['dL_dpsi1'],
+                               grad_dict['dL_dpsi2'],
+                               Z=Z,
+                               variational_posterior=X)
         else:
             #gradients wrt kernel
-            self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X)
-            target = self.kern.gradient.copy()
-            self.kern.update_gradients_full(self.grad_dict['dL_dKnm'], self.X, self.Z)
-            target += self.kern.gradient
-            self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z, None)
-            self.kern.gradient += target
+            kern.update_gradients_diag(grad_dict['dL_dKdiag'], X)
+            current_values['kerngrad'] = kern.gradient.copy()
+            kern.update_gradients_full(grad_dict['dL_dKnm'], X, Z)
+            current_values['kerngrad'] += kern.gradient
+            kern.update_gradients_full(grad_dict['dL_dKmm'], Z, None)
+            current_values['kerngrad'] += kern.gradient
 
             #gradients wrt Z
-            self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z)
-            self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X)
+            current_values['Zgrad'] = kern.gradients_X(grad_dict['dL_dKmm'], Z)
+            current_values['Zgrad'] += kern.gradients_X(grad_dict['dL_dKnm'].T, Z, X)
+        return posterior, log_marginal_likelihood, grad_dict, current_values, subset_indices
+
+    def _inner_take_over_or_update(self, full_values=None, current_values=None, value_indices=None):
+        """
+        This is for automatic updates of values in the inner loop of missing
+        data handling. All arguments are dictionaries; the values in
+        full_values will be updated with those in current_values.
+
+        If a key from current_values does not exist in full_values, it will be
+        initialized to the value in current_values.
+
+        If indices are needed for the update, value_indices can be used for
+        that. If value_indices holds the same key as current_values, the update
+        in full_values will be indexed by the indices in value_indices.
+
+        full_values:
+        dictionary of accumulated values (you will have to make sure carefully
+        that the ordering is right!). The values in here will be updated such that
+        full_values[key] += current_values[key] for all keys in current_values.keys()
+
+        current_values:
+        dictionary of values computed for the current subset of the data.
+
+        value_indices:
+        dictionary holding indices for the update in full_values.
+        If the key exists, the update rule is:
+        full_values[key][value_indices[key]] += current_values[key]
+        """
+        for key in current_values.keys():
+            if value_indices is not None and value_indices.has_key(key):
+                index = value_indices[key]
+            else:
+                index = slice(None)
+            if full_values.has_key(key):
+                full_values[key][index] += current_values[key]
+            else:
+                full_values[key] = current_values[key]
+
+    def _inner_values_update(self, current_values):
+        """
+        Override this if there is more to do with the current values.
+        It will always be called in the inner loop, so additional updates
+        can be performed inside the missing data loop. This can also be
+        used for stochastic updates, when only working on one dimension
+        of the output.
+        """
+        pass
+
+    def _outer_values_update(self, full_values):
+        """
+        Here you put the previously collected values into the right places,
+        e.g. set the gradients of the parameters.
+        """
+        self.likelihood.gradient = full_values['likgrad']
+        self.kern.gradient = full_values['kerngrad']
+        self.Z.gradient = full_values['Zgrad']
+
+    def _outer_init_full_values(self):
+        """
+        If full_values has indices in value_indices, we might want to initialize
+        full_values differently, so that subsetting is possible.
+
+        Here you can initialize full_values for the values needed.
+
+        Keep in mind that if a key does not exist in full_values when updating
+        values, it will be set (so e.g. for Z there is no need to initialize Zgrad,
+        as there is no subsetting needed. 
For X in BGPLVM on the other hand we probably need + to initialize the gradients for the mean and the variance in order to + have the full gradient for indexing) + """ + return {} + + def _outer_loop_for_missing_data(self): + Lm = None + dL_dKmm = None + Kmm = None + + self._log_marginal_likelihood = 0 + full_values = self._outer_init_full_values() + + woodbury_inv = np.zeros((self.num_inducing, self.num_inducing, self.output_dim)) + woodbury_vector = np.zeros((self.num_inducing, self.output_dim)) + m_f = lambda i: "Inference with missing data: {: >7.2%}".format(float(i+1)/self.output_dim) + message = m_f(-1) + print message, + for d in xrange(self.output_dim): + ninan = self.ninan[:, d] + print ' '*(len(message)) + '\r', + message = m_f(d) + print message, + + posterior, log_marginal_likelihood, \ + grad_dict, current_values, value_indices = self._inner_parameters_changed( + self.kern, self.X[ninan], + self.Z, self.likelihood, + self.Ylist[d], self.Y_metadata, + Lm, dL_dKmm, + subset_indices=dict(outputs=d, samples=ninan)) + + self._inner_take_over_or_update(full_values, current_values, value_indices) + self._inner_values_update(current_values) + + Lm = posterior.K_chol + dL_dKmm = grad_dict['dL_dKmm'] + Kmm = posterior._K + woodbury_inv[:, :, d] = posterior.woodbury_inv + woodbury_vector[:, d:d+1] = posterior.woodbury_vector + self._log_marginal_likelihood += log_marginal_likelihood + print '' + + self.posterior = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector, + K=Kmm, mean=None, cov=None, K_chol=Lm) + self._outer_values_update(full_values) + + def parameters_changed(self): + if self.missing_data: + self._outer_loop_for_missing_data() + else: + self.posterior, self._log_marginal_likelihood, self.grad_dict, gradients, _ = self._inner_parameters_changed(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata) + self.kern.gradient = gradients['kerngrad'] + self.Z.gradient = gradients['Zgrad'] def _raw_predict(self, Xnew, full_cov=False, kern=None): """ diff --git a/GPy/core/sparse_gp_mpi.py b/GPy/core/sparse_gp_mpi.py index e7faf7a8..d0008e58 100644 --- a/GPy/core/sparse_gp_mpi.py +++ b/GPy/core/sparse_gp_mpi.py @@ -34,15 +34,15 @@ class SparseGP_MPI(SparseGP): """ - def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp mpi', Y_metadata=None, mpi_comm=None, normalizer=False): + def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp mpi', Y_metadata=None, mpi_comm=None, normalizer=False, missing_data=False): self._IN_OPTIMIZATION_ = False if mpi_comm != None: if inference_method is None: inference_method = VarDTC_minibatch(mpi_comm=mpi_comm) else: assert isinstance(inference_method, VarDTC_minibatch), 'inference_method has to support MPI!' 
-
-        super(SparseGP_MPI, self).__init__(X, Y, Z, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
+
+        super(SparseGP_MPI, self).__init__(X, Y, Z, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer, missing_data=missing_data)
         self.update_model(False)
         self.link_parameter(self.X, index=0)
         if variational_prior is not None:
@@ -75,18 +75,18 @@ class SparseGP_MPI(SparseGP):
         return dc
 
     #=====================================================
-    # The MPI parallelization
+    # The MPI parallelization 
     #     - can move to model at some point
     #=====================================================
-
+    
     @SparseGP.optimizer_array.setter
     def optimizer_array(self, p):
         if self.mpi_comm != None:
             if self._IN_OPTIMIZATION_ and self.mpi_comm.rank==0:
                 self.mpi_comm.Bcast(np.int32(1),root=0)
-                self.mpi_comm.Bcast(p, root=0)
+                self.mpi_comm.Bcast(p, root=0)            
         SparseGP.optimizer_array.fset(self,p)
-
+    
     def optimize(self, optimizer=None, start=None, **kwargs):
         self._IN_OPTIMIZATION_ = True
         if self.mpi_comm==None:
diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index f79f4e6f..bf99dd66 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -367,14 +367,14 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1,
     Y = Ylist[0]
 
     k = kern.Linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
-    inan = _np.random.binomial(1, .8, size=Y.shape).astype(bool) # 80% missing data
+    inan = _np.random.binomial(1, .2, size=Y.shape).astype(bool) # 20% missing data
     Ymissing = Y.copy()
     Ymissing[inan] = _np.nan
     m = BayesianGPLVM(Ymissing, Q, init="random",
                       num_inducing=num_inducing,
-                      inference_method=VarDTCMissingData(inan=inan), kernel=k)
+                      kernel=k, missing_data=True)
-    m.X.variance[:] = _np.random.uniform(0,.01,m.X.shape)
+    m.X.variance[:] = _np.random.uniform(0,.1,m.X.shape)
     m.likelihood.variance = .01
     m.parameters_changed()
     m.Yreal = Y
diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py
index 70637e3b..b10ec832 100644
--- a/GPy/inference/latent_function_inference/var_dtc.py
+++ b/GPy/inference/latent_function_inference/var_dtc.py
@@ -62,7 +62,9 @@ class VarDTC(LatentFunctionInference):
     def get_VVTfactor(self, Y, prec):
         return Y * prec # TODO chache this, and make it effective
 
-    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
+
+
+    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None):
         _, output_dim = Y.shape
 
         uncertain_inputs = isinstance(X, VariationalPosterior)
@@ -84,8 +86,8 @@ class VarDTC(LatentFunctionInference):
         Kmm = kern.K(Z).copy()
         diag.add(Kmm, self.const_jitter)
-        Lm = jitchol(Kmm)
-
+        if Lm is None:
+            Lm = jitchol(Kmm)
 
         # The rather complex computations of A, and the psi stats
         if uncertain_inputs:
@@ -100,7 +102,6 @@ class VarDTC(LatentFunctionInference):
         else:
             psi0 = kern.Kdiag(X)
             psi1 = kern.K(X, Z)
-            psi2 = None
             if het_noise:
                 tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1)))
             else:
@@ -122,11 +123,12 @@ class VarDTC(LatentFunctionInference):
         delit = tdot(_LBi_Lmi_psi1Vf)
         data_fit = np.trace(delit)
         DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit)
-        delit = -0.5 * DBi_plus_BiPBi
-        delit += -0.5 * B * output_dim
-        delit += output_dim * np.eye(num_inducing)
-        # Compute dL_dKmm
-        dL_dKmm = backsub_both_sides(Lm, delit)
+        if dL_dKmm is None:
+            delit = -0.5 
* DBi_plus_BiPBi + delit += -0.5 * B * output_dim + delit += output_dim * np.eye(num_inducing) + # Compute dL_dKmm + dL_dKmm = backsub_both_sides(Lm, delit) # derivatives of L w.r.t. psi dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, @@ -208,6 +210,7 @@ class VarDTCMissingData(LatentFunctionInference): inan = np.isnan(Y) has_none = inan.any() self._inan = ~inan + inan = self._inan else: inan = self._inan has_none = True @@ -241,7 +244,7 @@ class VarDTCMissingData(LatentFunctionInference): self._subarray_indices = [[slice(None),slice(None)]] return [Y], [(Y**2).sum()] - def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None): + def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None): if isinstance(X, VariationalPosterior): uncertain_inputs = True psi0_all = kern.psi0(Z, X) @@ -260,7 +263,7 @@ class VarDTCMissingData(LatentFunctionInference): dL_dpsi0_all = np.zeros(Y.shape[0]) dL_dpsi1_all = np.zeros((Y.shape[0], num_inducing)) if uncertain_inputs: - dL_dpsi2_all = np.zeros((num_inducing, num_inducing)) + dL_dpsi2_all = np.zeros((Y.shape[0], num_inducing, num_inducing)) dL_dR = 0 woodbury_vector = np.zeros((num_inducing, Y.shape[1])) @@ -271,31 +274,31 @@ class VarDTCMissingData(LatentFunctionInference): Kmm = kern.K(Z).copy() diag.add(Kmm, self.const_jitter) #factor Kmm - Lm = jitchol(Kmm) + if Lm is None: + Lm = jitchol(Kmm) if uncertain_inputs: LmInv = dtrtri(Lm) - size = Y.shape[1] + size = len(Ys) next_ten = 0 - for i, [y, v, trYYT] in enumerate(itertools.izip(Ys, self._inan.T, traces)): if ((i+1.)/size) >= next_ten: logger.info('inference {:> 6.1%}'.format((i+1.)/size)) next_ten += .1 if het_noise: beta = beta_all[i] else: beta = beta_all - VVT_factor = (y*beta) output_dim = 1#len(ind) psi0 = psi0_all[v] psi1 = psi1_all[v, :] - if uncertain_inputs: psi2 = kern.psi2(Z, X[v, :]) + if uncertain_inputs: + psi2 = kern.psi2(Z, X[v, :]) else: psi2 = None num_data = psi1.shape[0] if uncertain_inputs: if het_noise: psi2_beta = psi2 * (beta.flatten().reshape(num_data, 1, 1)).sum(0) - else: psi2_beta = psi2 * beta + else: psi2_beta = psi2 * (beta) A = LmInv.dot(psi2_beta.dot(LmInv.T)) else: if het_noise: tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1))) @@ -330,11 +333,11 @@ class VarDTCMissingData(LatentFunctionInference): dL_dpsi0_all[v] += dL_dpsi0 dL_dpsi1_all[v, :] += dL_dpsi1 if uncertain_inputs: - dL_dpsi2_all += dL_dpsi2 + dL_dpsi2_all[v, :, :] += dL_dpsi2 # log marginal likelihood log_marginal += _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise, - psi0, A, LB, trYYT, data_fit,VVT_factor) + psi0, A, LB, trYYT, data_fit, VVT_factor) #put the gradients in the right places dL_dR += _compute_dL_dR(likelihood, @@ -393,7 +396,6 @@ def _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, VVT_factor, C # subsume back into psi1 (==Kmn) dL_dpsi1 += 2.*np.dot(psi1, dL_dpsi2) dL_dpsi2 = None - return dL_dpsi0, dL_dpsi1, dL_dpsi2 diff --git a/GPy/kern/_src/psi_comp/rbf_psi_comp.py b/GPy/kern/_src/psi_comp/rbf_psi_comp.py index 93399ea7..d2d3312d 100644 --- a/GPy/kern/_src/psi_comp/rbf_psi_comp.py +++ b/GPy/kern/_src/psi_comp/rbf_psi_comp.py @@ -17,7 +17,7 @@ def psicomputations(variance, lengthscale, Z, variational_posterior): # _psi1 NxM mu = variational_posterior.mean S = variational_posterior.variance - + psi0 = np.empty(mu.shape[0]) psi0[:] = variance psi1 = _psi1computations(variance, lengthscale, Z, mu, S) @@ -41,7 +41,7 @@ def __psi1computations(variance, lengthscale, Z, mu, S): 
_psi1_logdenom = np.log(S/lengthscale2+1.).sum(axis=-1) # N _psi1_log = (_psi1_logdenom[:,None]+np.einsum('nmq,nq->nm',np.square(mu[:,None,:]-Z[None,:,:]),1./(S+lengthscale2)))/(-2.) _psi1 = variance*np.exp(_psi1_log) - + return _psi1 def __psi2computations(variance, lengthscale, Z, mu, S): @@ -54,27 +54,27 @@ def __psi2computations(variance, lengthscale, Z, mu, S): # here are the "statistics" for psi2 # Produced intermediate results: # _psi2 MxM - + lengthscale2 = np.square(lengthscale) - + _psi2_logdenom = np.log(2.*S/lengthscale2+1.).sum(axis=-1)/(-2.) # N _psi2_exp1 = (np.square(Z[:,None,:]-Z[None,:,:])/lengthscale2).sum(axis=-1)/(-4.) #MxM Z_hat = (Z[:,None,:]+Z[None,:,:])/2. #MxMxQ denom = 1./(2.*S+lengthscale2) _psi2_exp2 = -(np.square(mu)*denom).sum(axis=-1)[:,None,None]+2.*np.einsum('nq,moq,nq->nmo',mu,Z_hat,denom)-np.einsum('moq,nq->nmo',np.square(Z_hat),denom) _psi2 = variance*variance*np.exp(_psi2_logdenom[:,None,None]+_psi2_exp1[None,:,:]+_psi2_exp2) - + return _psi2 def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): ARD = (len(lengthscale)!=1) - + dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance) dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance) dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2 - + dL_dlengscale = dl_psi1 + dl_psi2 if not ARD: dL_dlengscale = dL_dlengscale.sum() @@ -82,7 +82,7 @@ def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscal dL_dmu = dmu_psi1 + dmu_psi2 dL_dS = dS_psi1 + dS_psi2 dL_dZ = dZ_psi1 + dZ_psi2 - + return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S): @@ -101,9 +101,9 @@ def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S): # _dL_dgamma NxQ # _dL_dmu NxQ # _dL_dS NxQ - + lengthscale2 = np.square(lengthscale) - + _psi1 = _psi1computations(variance, lengthscale, Z, mu, S) Lpsi1 = dL_dpsi1*_psi1 Zmu = Z[None,:,:]-mu[:,None,:] # NxMxQ @@ -114,7 +114,7 @@ def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S): _dL_dS = np.einsum('nm,nmq,nq->nq',Lpsi1,(Zmu2_denom-1.),denom)/2. 
_dL_dZ = -np.einsum('nm,nmq,nq->mq',Lpsi1,Zmu,denom) _dL_dl = np.einsum('nm,nmq,nq->q',Lpsi1,(Zmu2_denom+(S/lengthscale2)[:,None,:]),denom*lengthscale) - + return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S): @@ -133,13 +133,12 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S): # _dL_dgamma NxQ # _dL_dmu NxQ # _dL_dS NxQ - + lengthscale2 = np.square(lengthscale) denom = 1./(2*S+lengthscale2) denom2 = np.square(denom) _psi2 = _psi2computations(variance, lengthscale, Z, mu, S) # NxMxM - Lpsi2 = dL_dpsi2[None,:,:]*_psi2 Lpsi2sum = np.einsum('nmo->n',Lpsi2) #N Lpsi2Z = np.einsum('nmo,oq->nq',Lpsi2,Z) #NxQ @@ -147,7 +146,7 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S): Lpsi2Z2p = np.einsum('nmo,mq,oq->nq',Lpsi2,Z,Z) #NxQ Lpsi2Zhat = Lpsi2Z Lpsi2Zhat2 = (Lpsi2Z2+Lpsi2Z2p)/2 - + _dL_dvar = Lpsi2sum.sum()*2/variance _dL_dmu = (-2*denom) * (mu*Lpsi2sum[:,None]-Lpsi2Zhat) _dL_dS = (2*np.square(denom))*(np.square(mu)*Lpsi2sum[:,None]-2*mu*Lpsi2Zhat+Lpsi2Zhat2) - denom*Lpsi2sum[:,None] @@ -157,6 +156,6 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S): (2*mu*denom2)*Lpsi2Zhat+denom2*Lpsi2Zhat2).sum(axis=0) return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS - + _psi1computations = Cacher(__psi1computations, limit=1) _psi2computations = Cacher(__psi2computations, limit=1) diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py index 5c5e7b5c..54c11fea 100644 --- a/GPy/models/bayesian_gplvm.py +++ b/GPy/models/bayesian_gplvm.py @@ -25,7 +25,9 @@ class BayesianGPLVM(SparseGP_MPI): """ def __init__(self, Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10, - Z=None, kernel=None, inference_method=None, likelihood=None, name='bayesian gplvm', mpi_comm=None, normalizer=None): + Z=None, kernel=None, inference_method=None, likelihood=None, + name='bayesian gplvm', mpi_comm=None, normalizer=None, + missing_data=False): self.mpi_comm = mpi_comm self.__IN_OPTIMIZATION__ = False @@ -59,24 +61,23 @@ class BayesianGPLVM(SparseGP_MPI): X = NormalPosterior(X, X_variance) if inference_method is None: - inan = np.isnan(Y) - if np.any(inan): - from ..inference.latent_function_inference.var_dtc import VarDTCMissingData - self.logger.debug("creating inference_method with var_dtc missing data") - inference_method = VarDTCMissingData(inan=inan) - elif mpi_comm is not None: + if mpi_comm is not None: inference_method = VarDTC_minibatch(mpi_comm=mpi_comm) else: from ..inference.latent_function_inference.var_dtc import VarDTC self.logger.debug("creating inference_method var_dtc") - inference_method = VarDTC() + inference_method = VarDTC(limit=1 if not missing_data else Y.shape[1]) if isinstance(inference_method,VarDTC_minibatch): inference_method.mpi_comm = mpi_comm if kernel.useGPU and isinstance(inference_method, VarDTC_GPU): kernel.psicomp.GPU_direct = True - super(BayesianGPLVM,self).__init__(X, Y, Z, kernel, likelihood=likelihood, name=name, inference_method=inference_method, normalizer=normalizer, mpi_comm=mpi_comm, variational_prior=self.variational_prior) + super(BayesianGPLVM,self).__init__(X, Y, Z, kernel, likelihood=likelihood, + name=name, inference_method=inference_method, + normalizer=normalizer, mpi_comm=mpi_comm, + variational_prior=self.variational_prior, + missing_data=missing_data) def set_X_gradients(self, X, X_grad): """Set the gradients of the posterior distribution of X in its specific form.""" @@ -86,15 +87,48 @@ class BayesianGPLVM(SparseGP_MPI): """Get the gradients of the 
posterior distribution of X in its specific form.""" return X.mean.gradient, X.variance.gradient + def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None): + posterior, log_marginal_likelihood, grad_dict, current_values, value_indices = super(BayesianGPLVM, self)._inner_parameters_changed(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm, subset_indices=subset_indices) + + log_marginal_likelihood -= self.variational_prior.KL_divergence(X) + + current_values['meangrad'], current_values['vargrad'] = self.kern.gradients_qX_expectations( + variational_posterior=X, + Z=Z, dL_dpsi0=grad_dict['dL_dpsi0'], + dL_dpsi1=grad_dict['dL_dpsi1'], + dL_dpsi2=grad_dict['dL_dpsi2']) + X.mean.gradient[:] = 0 + X.variance.gradient[:] = 0 + self.variational_prior.update_gradients_KL(X) + current_values['meangrad'] += X.mean.gradient + current_values['vargrad'] += X.variance.gradient + + value_indices['meangrad'] = subset_indices['samples'] + value_indices['vargrad'] = subset_indices['samples'] + return posterior, log_marginal_likelihood, grad_dict, current_values, value_indices + + def _outer_values_update(self, full_values): + """ + Here you put the values, which were collected before in the right places. + E.g. set the gradients of parameters, etc. + """ + super(BayesianGPLVM, self)._outer_values_update(full_values) + self.X.mean.gradient = full_values['meangrad'] + self.X.variance.gradient = full_values['vargrad'] + + def _outer_init_full_values(self): + return dict(meangrad=np.zeros(self.X.mean.shape), + vargrad=np.zeros(self.X.variance.shape)) + def parameters_changed(self): super(BayesianGPLVM,self).parameters_changed() if isinstance(self.inference_method, VarDTC_minibatch): return - super(BayesianGPLVM, self).parameters_changed() - self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X) + #super(BayesianGPLVM, self).parameters_changed() + #self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X) - self.X.mean.gradient, self.X.variance.gradient = self.kern.gradients_qX_expectations(variational_posterior=self.X, Z=self.Z, dL_dpsi0=self.grad_dict['dL_dpsi0'], dL_dpsi1=self.grad_dict['dL_dpsi1'], dL_dpsi2=self.grad_dict['dL_dpsi2']) + #self.X.mean.gradient, self.X.variance.gradient = self.kern.gradients_qX_expectations(variational_posterior=self.X, Z=self.Z, dL_dpsi0=self.grad_dict['dL_dpsi0'], dL_dpsi1=self.grad_dict['dL_dpsi1'], dL_dpsi2=self.grad_dict['dL_dpsi2']) # This is testing code ------------------------- # i = np.random.randint(self.X.shape[0]) @@ -113,7 +147,7 @@ class BayesianGPLVM(SparseGP_MPI): # ----------------------------------------------- # update for the KL divergence - self.variational_prior.update_gradients_KL(self.X) + #self.variational_prior.update_gradients_KL(self.X) def plot_latent(self, labels=None, which_indices=None, resolution=50, ax=None, marker='o', s=40, @@ -178,7 +212,7 @@ class BayesianGPLVM(SparseGP_MPI): """ dmu_dX = np.zeros_like(Xnew) for i in range(self.Z.shape[0]): - dmu_dX += self.kern.gradients_X(self.Cpsi1Vf[i:i + 1, :], Xnew, self.Z[i:i + 1, :]) + dmu_dX += self.kern.gradients_X(self.grad_dict['dL_dpsi1'][i:i + 1, :], Xnew, self.Z[i:i + 1, :]) return dmu_dX def dmu_dXnew(self, Xnew): @@ -189,7 +223,7 @@ class BayesianGPLVM(SparseGP_MPI): ones = np.ones((1, 1)) for i in range(self.Z.shape[0]): gradients_X[:, i] = self.kern.gradients_X(ones, Xnew, self.Z[i:i + 1, :]).sum(-1) - return np.dot(gradients_X, self.Cpsi1Vf) + return np.dot(gradients_X, 
self.grad_dict['dL_dpsi1'])
 
     def plot_steepest_gradient_map(self, *args, ** kwargs):
         """
diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py
index 42f82121..c87a44b1 100644
--- a/GPy/testing/model_tests.py
+++ b/GPy/testing/model_tests.py
@@ -20,7 +20,7 @@ class MiscTests(unittest.TestCase):
         m = GPy.models.GPRegression(self.X, self.Y, kernel=k)
         m.randomize()
         m.likelihood.variance = .5
-        Kinv = np.linalg.pinv(k.K(self.X) + np.eye(self.N)*m.likelihood.variance)
+        Kinv = np.linalg.pinv(k.K(self.X) + np.eye(self.N) * m.likelihood.variance)
         K_hat = k.K(self.X_new) - k.K(self.X_new, self.X).dot(Kinv).dot(k.K(self.X, self.X_new))
         mu_hat = k.K(self.X_new, self.X).dot(Kinv).dot(m.Y_normalized)
 
@@ -43,7 +43,7 @@ class MiscTests(unittest.TestCase):
         Z = m.Z[:]
         X = self.X[:]
 
-        #Not easy to check if woodbury_inv is correct in itself as it requires a large derivation and expression
+        # Not easy to check if woodbury_inv is correct in itself as it requires a large derivation and expression
         Kinv = m.posterior.woodbury_inv
         K_hat = k.K(self.X_new) - k.K(self.X_new, Z).dot(Kinv).dot(k.K(Z, self.X_new))
 
@@ -51,13 +51,13 @@ class MiscTests(unittest.TestCase):
         self.assertEquals(mu.shape, (self.N_new, self.D))
         self.assertEquals(covar.shape, (self.N_new, self.N_new))
         np.testing.assert_almost_equal(K_hat, covar)
-        #np.testing.assert_almost_equal(mu_hat, mu)
+        # np.testing.assert_almost_equal(mu_hat, mu)
 
         mu, var = m._raw_predict(self.X_new)
         self.assertEquals(mu.shape, (self.N_new, self.D))
         self.assertEquals(var.shape, (self.N_new, 1))
         np.testing.assert_almost_equal(np.diag(K_hat)[:, None], var)
-        #np.testing.assert_almost_equal(mu_hat, mu)
+        # np.testing.assert_almost_equal(mu_hat, mu)
 
     def test_likelihood_replicate(self):
         m = GPy.models.GPRegression(self.X, self.Y)
@@ -110,6 +110,29 @@ class MiscTests(unittest.TestCase):
         m2.kern.lengthscale = m.kern['.*lengthscale']
         np.testing.assert_equal(m.log_likelihood(), m2.log_likelihood())
 
+    def test_missing_data(self):
+        from GPy import kern
+        from GPy.models import BayesianGPLVM
+        from GPy.examples.dimensionality_reduction import _simulate_matern
+
+        D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 400, 3, 4
+        _, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, False)
+        Y = Ylist[0]
+
+        inan = np.random.binomial(1, .9, size=Y.shape).astype(bool) # 90% missing data
+        Ymissing = Y.copy()
+        Ymissing[inan] = np.nan
+
+        k = kern.Linear(Q, ARD=True) + kern.White(Q, np.exp(-2)) # + kern.bias(Q)
+        m = BayesianGPLVM(Ymissing, Q, init="random", num_inducing=num_inducing,
+                          kernel=k, missing_data=True)
+        assert(m.checkgrad())
+
+        k = kern.RBF(Q, ARD=True) + kern.White(Q, np.exp(-2)) # + kern.bias(Q)
+        m = BayesianGPLVM(Ymissing, Q, init="random", num_inducing=num_inducing,
+                          kernel=k, missing_data=True)
+        assert(m.checkgrad())
+
     def test_likelihood_replicate_kern(self):
         m = GPy.models.GPRegression(self.X, self.Y)
         m2 = GPy.models.GPRegression(self.X, self.Y)
@@ -158,7 +181,7 @@ class MiscTests(unittest.TestCase):
     def test_model_optimize(self):
         X = np.random.uniform(-3., 3., (20, 1))
         Y = np.sin(X) + np.random.randn(20, 1) * 0.05
-        m = GPy.models.GPRegression(X,Y)
+        m = GPy.models.GPRegression(X, Y)
         m.optimize()
         print m
 
@@ -219,8 +242,8 @@ class GradientTests(np.testing.TestCase):
         mlp = GPy.kern.MLP(1)
         self.check_model(mlp, model_type='GPRegression', dimension=1)
 
-    #TODO:
-    #def test_GPRegression_poly_1d(self):
+    # TODO:
+    # def test_GPRegression_poly_1d(self):
     #    ''' Testing the GP regression with polynomial kernel with white kernel on 1d data '''
     #    mlp = 
GPy.kern.Poly(1, degree=5) # self.check_model(mlp, model_type='GPRegression', dimension=1) @@ -417,11 +440,11 @@ class GradientTests(np.testing.TestCase): def test_gp_heteroscedastic_regression(self): num_obs = 25 - X = np.random.randint(0,140,num_obs) - X = X[:,None] - Y = 25. + np.sin(X/20.) * 2. + np.random.rand(num_obs)[:,None] + X = np.random.randint(0, 140, num_obs) + X = X[:, None] + Y = 25. + np.sin(X / 20.) * 2. + np.random.rand(num_obs)[:, None] kern = GPy.kern.Bias(1) + GPy.kern.RBF(1) - m = GPy.models.GPHeteroscedasticRegression(X,Y,kern) + m = GPy.models.GPHeteroscedasticRegression(X, Y, kern) self.assertTrue(m.checkgrad()) def test_gp_kronecker_gaussian(self): @@ -432,12 +455,12 @@ class GradientTests(np.testing.TestCase): k1 = GPy.kern.RBF(1) # + GPy.kern.White(1) k2 = GPy.kern.RBF(1) # + GPy.kern.White(1) Y = np.random.randn(N1, N2) - Y = Y-Y.mean(0) - Y = Y/Y.std(0) + Y = Y - Y.mean(0) + Y = Y / Y.std(0) m = GPy.models.GPKroneckerGaussianRegression(X1, X2, Y, k1, k2) # build the model the dumb way - assert (N1*N2<1000), "too much data for standard GPs!" + assert (N1 * N2 < 1000), "too much data for standard GPs!" yy, xx = np.meshgrid(X2, X1) Xgrid = np.vstack((xx.flatten(order='F'), yy.flatten(order='F'))).T kg = GPy.kern.RBF(1, active_dims=[0]) * GPy.kern.RBF(1, active_dims=[1]) @@ -458,11 +481,11 @@ class GradientTests(np.testing.TestCase): def test_gp_VGPC(self): num_obs = 25 - X = np.random.randint(0,140,num_obs) - X = X[:,None] - Y = 25. + np.sin(X/20.) * 2. + np.random.rand(num_obs)[:,None] + X = np.random.randint(0, 140, num_obs) + X = X[:, None] + Y = 25. + np.sin(X / 20.) * 2. + np.random.rand(num_obs)[:, None] kern = GPy.kern.Bias(1) + GPy.kern.RBF(1) - m = GPy.models.GPVariationalGaussianApproximation(X,Y,kern) + m = GPy.models.GPVariationalGaussianApproximation(X, Y, kern) self.assertTrue(m.checkgrad())
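
For reference, a minimal sketch of how the new missing_data flag is meant to be used, mirroring the updated example and test above. It assumes the _simulate_matern helper from GPy.examples.dimensionality_reduction (imported the same way as in the new test); the dimensions and the final optimize() call are illustrative, not prescribed by the patch.

    import numpy as np
    from GPy import kern
    from GPy.models import BayesianGPLVM
    from GPy.examples.dimensionality_reduction import _simulate_matern

    # simulate a small data set and knock out ~20% of the entries
    D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 400, 3, 4
    _, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, False)
    Y = Ylist[0]
    inan = np.random.binomial(1, .2, size=Y.shape).astype(bool)  # 20% missing data
    Ymissing = Y.copy()
    Ymissing[inan] = np.nan

    # missing_data=True routes inference through the new per-output-dimension
    # loop (_outer_loop_for_missing_data) instead of the old VarDTCMissingData path
    m = BayesianGPLVM(Ymissing, Q, init="random", num_inducing=num_inducing,
                      kernel=kern.Linear(Q, ARD=True), missing_data=True)
    m.optimize()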