From b5cac9af8ca48531a1a03180f418cfcea2f30e3a Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Fri, 22 May 2015 14:29:53 +0100 Subject: [PATCH] [ssmrd] implement with IBP prior --- GPy/core/parameterization/variational.py | 10 +- GPy/models/ss_gplvm.py | 26 ++- GPy/models/ss_mrd.py | 253 +++++++++++++++++++++-- 3 files changed, 262 insertions(+), 27 deletions(-) diff --git a/GPy/core/parameterization/variational.py b/GPy/core/parameterization/variational.py index 16192845..7a58ac5e 100644 --- a/GPy/core/parameterization/variational.py +++ b/GPy/core/parameterization/variational.py @@ -80,7 +80,7 @@ class SpikeAndSlabPrior(VariationalPrior): pi = self.pi if self.group_spike: - dgamma = np.log((1-pi)/pi*gamma/(1.-gamma))/mu.shape[0] + dgamma = np.log((1-pi)/pi*gamma/(1.-gamma))/variational_posterior.num_data else: dgamma = np.log((1-pi)/pi*gamma/(1.-gamma)) variational_posterior.binary_prob.gradient -= dgamma+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2. @@ -165,12 +165,16 @@ class SpikeAndSlabPosterior(VariationalPosterior): ''' The SpikeAndSlab distribution for variational approximations. ''' - def __init__(self, means, variances, binary_prob, group_spike=False, name='latent space'): + def __init__(self, means, variances, binary_prob, group_spike=False, sharedX=False, name='latent space'): """ binary_prob : the probability of the distribution on the slab part. """ super(SpikeAndSlabPosterior, self).__init__(means, variances, name) self.group_spike = group_spike + self.sharedX = sharedX + if sharedX: + self.mean.fix(warning=False) + self.variance.fix(warning=False) if group_spike: self.gamma_group = Param("binary_prob_group",binary_prob.mean(axis=0),Logistic(1e-6,1.-1e-6)) self.gamma = Param("binary_prob",binary_prob, __fixed__) @@ -181,7 +185,7 @@ class SpikeAndSlabPosterior(VariationalPosterior): def propogate_val(self): if self.group_spike: - self.gamma.param_array.values.reshape(self.gamma.shape)[:] = self.gamma_group.values + self.gamma.values[:] = self.gamma_group.values def collate_gradient(self): if self.group_spike: diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py index 64d0fd5e..3e2aa552 100644 --- a/GPy/models/ss_gplvm.py +++ b/GPy/models/ss_gplvm.py @@ -15,12 +15,16 @@ class IBPPosterior(SpikeAndSlabPosterior): ''' The SpikeAndSlab distribution for variational approximations. ''' - def __init__(self, means, variances, binary_prob, tau=None, name='latent space'): + def __init__(self, means, variances, binary_prob, tau=None, sharedX=False, name='latent space'): """ binary_prob : the probability of the distribution on the slab part. """ from ..core.parameterization.transformations import Logexp super(IBPPosterior, self).__init__(means, variances, binary_prob, group_spike=True, name=name) + self.sharedX = sharedX + if sharedX: + self.mean.fix(warning=False) + self.variance.fix(warning=False) self.tau = Param("tau_", np.ones((self.gamma_group.shape[0],2)), Logexp()) self.link_parameter(self.tau) @@ -83,7 +87,7 @@ class IBPPrior(VariationalPrior): variational_posterior.mean.gradient -= gamma*mu/self.variance variational_posterior.variance.gradient -= (1./self.variance - 1./S) * gamma /2. from scipy.special import digamma,polygamma - dgamma = (np.log(gamma/(1.-gamma))+ digamma(tau[:,1])-digamma(tau[:,0]))/mu.shape[0] + dgamma = (np.log(gamma/(1.-gamma))+ digamma(tau[:,1])-digamma(tau[:,0]))/variational_posterior.num_data variational_posterior.binary_prob.gradient -= dgamma+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2. 
 ad = self.alpha/self.input_dim
 common = (ad+2-tau[:,0]-tau[:,1])*polygamma(1,tau.sum(axis=1))
@@ -103,9 +107,11 @@ class SSGPLVM(SparseGP_MPI):
     """
 
     def __init__(self, Y, input_dim, X=None, X_variance=None, Gamma=None, init='PCA', num_inducing=10,
-                 Z=None, kernel=None, inference_method=None, likelihood=None, name='Spike_and_Slab GPLVM', group_spike=False, IBP=False, alpha=2., tau=None, mpi_comm=None, pi=None, learnPi=False,normalizer=False, **kwargs):
+                 Z=None, kernel=None, inference_method=None, likelihood=None, name='Spike_and_Slab GPLVM', group_spike=False, IBP=False, alpha=2., tau=None, mpi_comm=None, pi=None, learnPi=False,normalizer=False, sharedX=False, variational_prior=None, **kwargs):
         self.group_spike = group_spike
+        self.init = init
+        self.sharedX = sharedX
 
         if X == None:
             from ..util.initialization import initialize_latent
@@ -113,8 +119,6 @@ class SSGPLVM(SparseGP_MPI):
         else:
             fracs = np.ones(input_dim)
 
-        self.init = init
-
         if X_variance is None: # The variance of the variational approximation (S)
             X_variance = np.random.uniform(0,.1,X.shape)
 
@@ -146,11 +150,11 @@ class SSGPLVM(SparseGP_MPI):
             pi[:] = 0.5
 
         if IBP:
-            self.variational_prior = IBPPrior(input_dim=input_dim, alpha=alpha)
-            X = IBPPosterior(X, X_variance, gamma, tau=tau)
+            self.variational_prior = IBPPrior(input_dim=input_dim, alpha=alpha) if variational_prior is None else variational_prior
+            X = IBPPosterior(X, X_variance, gamma, tau=tau, sharedX=sharedX)
         else:
-            self.variational_prior = SpikeAndSlabPrior(pi=pi,learnPi=learnPi, group_spike=group_spike) # the prior probability of the latent binary variable b
-            X = SpikeAndSlabPosterior(X, X_variance, gamma, group_spike=group_spike)
+            self.variational_prior = SpikeAndSlabPrior(pi=pi, learnPi=learnPi, group_spike=group_spike) if variational_prior is None else variational_prior # the prior probability of the latent binary variable b
+            X = SpikeAndSlabPosterior(X, X_variance, gamma, group_spike=group_spike, sharedX=sharedX)
 
         super(SSGPLVM,self).__init__(X, Y, Z, kernel, likelihood, variational_prior=self.variational_prior, inference_method=inference_method, name=name, mpi_comm=mpi_comm, normalizer=normalizer, **kwargs)
         self.link_parameter(self.X, index=0)
@@ -163,8 +167,12 @@ class SSGPLVM(SparseGP_MPI):
         """Get the gradients of the posterior distribution of X in its specific form."""
         return X.mean.gradient, X.variance.gradient, X.binary_prob.gradient
 
+    def _propogate_X_val(self):
+        pass
+
     def parameters_changed(self):
         self.X.propogate_val()
+        if self.sharedX: self._highest_parent_._propogate_X_val()
         super(SSGPLVM,self).parameters_changed()
         if isinstance(self.inference_method, VarDTC_minibatch):
             self.X.collate_gradient()
diff --git a/GPy/models/ss_mrd.py b/GPy/models/ss_mrd.py
index bd2efce0..41289d3f 100644
--- a/GPy/models/ss_mrd.py
+++ b/GPy/models/ss_mrd.py
@@ -2,33 +2,256 @@
 The Manifold Relevance Determination model with the spike-and-slab prior
 """
+import numpy as np
 from ..core import Model
 from .ss_gplvm import SSGPLVM
+from ..core.parameterization.variational import SpikeAndSlabPrior, NormalPosterior, VariationalPrior
+from ..util.misc import param_to_array
+from ..kern import RBF
+from ..core import Param
+from numpy.linalg.linalg import LinAlgError
 
 class SSMRD(Model):
-    def __init__(self, Ylist, input_dim, X=None, X_variance=None,
-                 initx = 'PCA', initz = 'permute',
-                 num_inducing=10, Z=None, kernel=None,
-                 inference_method=None, likelihoods=None, name='ss_mrd', Ynames=None):
+    def __init__(self, Ylist, input_dim, X=None, X_variance=None, Gammas=None, initx='PCA_concat', initz='permute',
+                 num_inducing=10, Zs=None, kernels=None, inference_methods=None, likelihoods=None, group_spike=True,
+                 pi=0.5, name='ss_mrd', Ynames=None, mpi_comm=None, IBP=False, alpha=2., taus=None):
         super(SSMRD, self).__init__(name)
+        self.mpi_comm = mpi_comm
+        self._PROPAGATE_ = False
 
-        self.updates = False
-        self.models = [SSGPLVM(y, input_dim, X=X, X_variance=X_variance, num_inducing=num_inducing,Z=Z,init=initx,
-                               kernel=kernel.copy() if kernel else None,inference_method=inference_method,likelihood=likelihoods,
-                               name='model_'+str(i)) for i,y in enumerate(Ylist)]
-        self.add_parameters(*(self.models))
+        # initialize X for individual models
+        X, X_variance, Gammas, fracs = self._init_X(Ylist, input_dim, X, X_variance, Gammas, initx)
+        self.X = NormalPosterior(means=X, variances=X_variance)
 
-        [[[self.models[m].X.mean[i,j:j+1].tie('mean_'+str(i)+'_'+str(j)) for m in range(len(self.models))] for j in range(self.models[0].X.mean.shape[1])]
-         for i in range(self.models[0].X.mean.shape[0])]
-        [[[self.models[m].X.variance[i,j:j+1].tie('var_'+str(i)+'_'+str(j)) for m in range(len(self.models))] for j in range(self.models[0].X.variance.shape[1])]
-         for i in range(self.models[0].X.variance.shape[0])]
+        if kernels is None:
+            kernels = [RBF(input_dim, lengthscale=1./fracs, ARD=True) for i in xrange(len(Ylist))]
+        if Zs is None:
+            Zs = [None]*len(Ylist)
+        if likelihoods is None:
+            likelihoods = [None]*len(Ylist)
+        if inference_methods is None:
+            inference_methods = [None]*len(Ylist)
 
-        self.updates = True
+        if IBP:
+            self.var_priors = [IBPPrior_SSMRD(len(Ylist), input_dim, alpha=alpha) for i in xrange(len(Ylist))]
+        else:
+            self.var_priors = [SpikeAndSlabPrior_SSMRD(nModels=len(Ylist), pi=pi, learnPi=False, group_spike=group_spike) for i in xrange(len(Ylist))]
+        self.models = [SSGPLVM(y, input_dim, X=X.copy(), X_variance=X_variance.copy(), Gamma=Gammas[i], num_inducing=num_inducing, Z=Zs[i], learnPi=False, group_spike=group_spike,
+                               kernel=kernels[i], inference_method=inference_methods[i], likelihood=likelihoods[i], variational_prior=self.var_priors[i], IBP=IBP, tau=None if taus is None else taus[i],
+                               name='model_'+str(i), mpi_comm=mpi_comm, sharedX=True) for i,y in enumerate(Ylist)]
+        self.link_parameters(*(self.models+[self.X]))
+
+    def _propogate_X_val(self):
+        if self._PROPAGATE_: return
+        for m in self.models:
+            m.X.mean.values[:] = self.X.mean.values
+            m.X.variance.values[:] = self.X.variance.values
+        varp_list = [m.X for m in self.models]
+        [vp._update_inernal(varp_list) for vp in self.var_priors]
+        self._PROPAGATE_ = True
+
+    def _collate_X_gradient(self):
+        self._PROPAGATE_ = False
+        self.X.mean.gradient[:] = 0
+        self.X.variance.gradient[:] = 0
+        for m in self.models:
+            self.X.mean.gradient += m.X.mean.gradient
+            self.X.variance.gradient += m.X.variance.gradient
 
     def parameters_changed(self):
-        super(SSMRD, self).parameters_changed()
+        super(SSMRD, self).parameters_changed()
+        [m.parameters_changed() for m in self.models]
         self._log_marginal_likelihood = sum([m._log_marginal_likelihood for m in self.models])
+        self._collate_X_gradient()
 
     def log_likelihood(self):
         return self._log_marginal_likelihood
+
+    def _init_X(self, Ylist, input_dim, X=None, X_variance=None, Gammas=None, initx='PCA_concat'):
+
+        # Divide the latent dimensions evenly among the views; the first
+        # (input_dim % len(Ylist)) views get one extra dimension.
+        idx = np.empty((input_dim,), dtype=np.int)
+        residue = input_dim % len(Ylist)
+        for i in xrange(len(Ylist)):
+            if i < residue:
+                size = input_dim/len(Ylist)+1
+                idx[i*size:(i+1)*size] = i
+            else:
+                size = input_dim/len(Ylist)
+                idx[i*size+residue:(i+1)*size+residue] = i
+
+        if X is None:
+            if initx == 'PCA_concat':
+                X = np.empty((Ylist[0].shape[0], input_dim))
+                fracs = np.empty((input_dim,))
+                from ..util.initialization import initialize_latent
+                for i in xrange(len(Ylist)):
+                    Y = Ylist[i]
+                    dim = (idx==i).sum()
+                    if dim>0:
+                        x, fr = initialize_latent('PCA', dim, Y)
+                        X[:,idx==i] = x
+                        fracs[idx==i] = fr
+            elif initx=='PCA_joint':
+                y = np.hstack(Ylist)
+                from ..util.initialization import initialize_latent
+                X, fracs = initialize_latent('PCA', input_dim, y)
+            else:
+                X = np.random.randn(Ylist[0].shape[0], input_dim)
+                fracs = np.ones(input_dim)
+        else:
+            fracs = np.ones(input_dim)
+
+        if X_variance is None: # The variance of the variational approximation (S)
+            X_variance = np.random.uniform(0,.1,X.shape)
+
+        if Gammas is None:
+            Gammas = []
+            for _ in Ylist: # one gamma per output view
+                gamma = np.empty_like(X) # The posterior probabilities of the binary variable in the variational approximation
+                gamma[:] = 0.5 + 0.1 * np.random.randn(X.shape[0], input_dim)
+                gamma[gamma>1.-1e-9] = 1.-1e-9
+                gamma[gamma<1e-9] = 1e-9
+                Gammas.append(gamma)
+        return X, X_variance, Gammas, fracs
+
+    @Model.optimizer_array.setter
+    def optimizer_array(self, p):
+        if self.mpi_comm is not None:
+            if self._IN_OPTIMIZATION_ and self.mpi_comm.rank==0:
+                self.mpi_comm.Bcast(np.int32(1), root=0)
+                self.mpi_comm.Bcast(p, root=0)
+        Model.optimizer_array.fset(self, p)
+
+    def optimize(self, optimizer=None, start=None, **kwargs):
+        self._IN_OPTIMIZATION_ = True
+        if self.mpi_comm is None:
+            super(SSMRD, self).optimize(optimizer, start, **kwargs)
+        elif self.mpi_comm.rank==0:
+            super(SSMRD, self).optimize(optimizer, start, **kwargs)
+            self.mpi_comm.Bcast(np.int32(-1), root=0)
+        elif self.mpi_comm.rank>0:
+            x = self.optimizer_array.copy()
+            flag = np.empty(1, dtype=np.int32)
+            while True:
+                self.mpi_comm.Bcast(flag, root=0)
+                if flag==1:
+                    try:
+                        self.optimizer_array = x
+                        self._fail_count = 0
+                    except (LinAlgError, ZeroDivisionError, ValueError):
+                        if self._fail_count >= self._allowed_failures:
+                            raise
+                        self._fail_count += 1
+                elif flag==-1:
+                    break
+                else:
+                    self._IN_OPTIMIZATION_ = False
+                    raise Exception("Unrecognized flag for synchronization!")
+        self._IN_OPTIMIZATION_ = False
+
+
+class SpikeAndSlabPrior_SSMRD(SpikeAndSlabPrior):
+    def __init__(self, nModels, pi=0.5, learnPi=False, group_spike=True, variance=1.0, name='SSMRDPrior', **kw):
+        self.nModels = nModels
+        self._b_prob_all = 0.5
+        super(SpikeAndSlabPrior_SSMRD, self).__init__(pi=pi, learnPi=learnPi, group_spike=group_spike, variance=variance, name=name, **kw)
+
+    def _update_inernal(self, varp_list):
+        """Update the internal state by gathering the variational posteriors of all the individual models."""
+        # _b_prob_all is the probability that the binary variable for a latent dimension is off in all models (its complement: on in at least one model).
+        if self.group_spike:
+            self._b_prob_all = 1.-param_to_array(varp_list[0].gamma_group)
+            [np.multiply(self._b_prob_all, 1.-vp.gamma_group, self._b_prob_all) for vp in varp_list[1:]]
+        else:
+            self._b_prob_all = 1.-param_to_array(varp_list[0].binary_prob)
+            [np.multiply(self._b_prob_all, 1.-vp.binary_prob, self._b_prob_all) for vp in varp_list[1:]]
+
+    def KL_divergence(self, variational_posterior):
+        mu = variational_posterior.mean
+        S = variational_posterior.variance
+        if self.group_spike:
+            gamma = variational_posterior.binary_prob[0]
+        else:
+            gamma = variational_posterior.binary_prob
+        if len(self.pi.shape)==2:
+            idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
+            pi = self.pi[idx]
+        else:
+            pi = self.pi
+
+        var_mean = np.square(mu)/self.variance
+        var_S = (S/self.variance - np.log(S))
+        var_gamma = (gamma*np.log(gamma/pi)).sum()+((1-gamma)*np.log((1-gamma)/(1-pi))).sum()
+        return var_gamma+((1.-self._b_prob_all)*(np.log(self.variance)-1.+var_mean+var_S)).sum()/(2.*self.nModels)
+
+    def update_gradients_KL(self, variational_posterior):
+        mu = variational_posterior.mean
+        S = variational_posterior.variance
+        N = variational_posterior.num_data
+        if self.group_spike:
+            gamma = variational_posterior.binary_prob.values[0]
+        else:
+            gamma = variational_posterior.binary_prob.values
+        if len(self.pi.shape)==2:
+            idx = np.unique(gamma._raveled_index()/gamma.shape[-1])
+            pi = self.pi[idx]
+        else:
+            pi = self.pi
+
+        if self.group_spike:
+            tmp = self._b_prob_all/(1.-gamma)
+            variational_posterior.binary_prob.gradient -= np.log((1-pi)/pi*gamma/(1.-gamma))/N + tmp*((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
+        else:
+            variational_posterior.binary_prob.gradient -= np.log((1-pi)/pi*gamma/(1.-gamma))+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
+        mu.gradient -= (1.-self._b_prob_all)*mu/(self.variance*self.nModels)
+        S.gradient -= (1./self.variance - 1./S)*(1.-self._b_prob_all)/(2.*self.nModels)
+        if self.learnPi:
+            raise NotImplementedError('Learning pi is not supported!')
+
+class IBPPrior_SSMRD(VariationalPrior):
+    def __init__(self, nModels, input_dim, alpha=2., tau=None, name='IBPPrior', **kw):
+        super(IBPPrior_SSMRD, self).__init__(name=name, **kw)
+        from ..core.parameterization.transformations import __fixed__
+        self.nModels = nModels
+        self._b_prob_all = 0.5
+        self.input_dim = input_dim
+        self.variance = 1.
+        self.alpha = Param('alpha', alpha, __fixed__)
+        self.link_parameter(self.alpha)
+
+    def _update_inernal(self, varp_list):
+        """Update the internal state by gathering the variational posteriors of all the individual models."""
+        # _b_prob_all is the probability that the binary variable for a latent dimension is off in all models (its complement: on in at least one model).
+        self._b_prob_all = 1.-param_to_array(varp_list[0].gamma_group)
+        [np.multiply(self._b_prob_all, 1.-vp.gamma_group, self._b_prob_all) for vp in varp_list[1:]]
+
+    def KL_divergence(self, variational_posterior):
+        mu, S, gamma, tau = variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma_group.values, variational_posterior.tau.values
+
+        var_mean = np.square(mu)/self.variance
+        var_S = (S/self.variance - np.log(S))
+        part1 = ((1.-self._b_prob_all)*(np.log(self.variance)-1.+var_mean+var_S)).sum()/(2.*self.nModels)
+
+        ad = self.alpha/self.input_dim
+        from scipy.special import betaln, digamma
+        part2 = (gamma*np.log(gamma)).sum() + ((1.-gamma)*np.log(1.-gamma)).sum() + (betaln(ad,1.)*self.input_dim - betaln(tau[:,0], tau[:,1]).sum())/self.nModels \
+                + (((tau[:,0]-ad)/self.nModels - gamma)*digamma(tau[:,0])).sum() \
+                + (((tau[:,1]-1.)/self.nModels+gamma-1.)*digamma(tau[:,1])).sum() + (((1.+ad-tau[:,0]-tau[:,1])/self.nModels+1.)*digamma(tau.sum(axis=1))).sum()
+        return part1+part2
+
+    def update_gradients_KL(self, variational_posterior):
+        mu, S, gamma, tau = variational_posterior.mean.values, variational_posterior.variance.values, variational_posterior.gamma_group.values, variational_posterior.tau.values
+
+        variational_posterior.mean.gradient -= (1.-self._b_prob_all)*mu/(self.variance*self.nModels)
+        variational_posterior.variance.gradient -= (1./self.variance - 1./S)*(1.-self._b_prob_all)/(2.*self.nModels)
+        from scipy.special import digamma, polygamma
+        tmp = self._b_prob_all/(1.-gamma)
+        dgamma = (np.log(gamma/(1.-gamma)) + digamma(tau[:,1]) - digamma(tau[:,0]))/variational_posterior.num_data
+        variational_posterior.binary_prob.gradient -= dgamma + tmp*((np.square(mu)+S)/self.variance - np.log(S) + np.log(self.variance) - 1.)/2.
+        ad = self.alpha/self.input_dim
+        common = ((1.+ad-tau[:,0]-tau[:,1])/self.nModels+1.)*polygamma(1,tau.sum(axis=1))
+        variational_posterior.tau.gradient[:,0] = -(((tau[:,0]-ad)/self.nModels - gamma)*polygamma(1,tau[:,0]) + common)
+        variational_posterior.tau.gradient[:,1] = -(((tau[:,1]-1.)/self.nModels+gamma-1.)*polygamma(1,tau[:,1]) + common)
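
Usage sketch (not part of the patch): how the new SSMRD constructor is meant to be driven after this change. The toy data, sizes, and iteration count below are made up for illustration.

    import numpy as np
    from GPy.models.ss_mrd import SSMRD

    np.random.seed(0)
    Y1 = np.random.randn(50, 8)   # view 1: 50 points, 8 output dimensions
    Y2 = np.random.randn(50, 6)   # view 2: 50 points, 6 output dimensions

    # Two views share one latent space. IBP=True builds one IBPPrior_SSMRD per
    # view; each sub-model is an SSGPLVM constructed with sharedX=True, so the
    # shared X is propagated to every view before each objective evaluation.
    m = SSMRD([Y1, Y2], input_dim=5, num_inducing=10, IBP=True, alpha=2.)
    m.optimize(max_iters=10)
    print(m.log_likelihood())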
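_init_X splits the input_dim latent dimensions as evenly as possible across the views, with the first input_dim % len(Ylist) views receiving one extra dimension; 'PCA_concat' then initializes each view's slice by a PCA of that view alone, while 'PCA_joint' runs a single PCA on the column-stacked views. A worked example of the split (my numbers, not from the patch):

    # input_dim=5, len(Ylist)=2 -> residue = 5 % 2 = 1
    # view 0 (i=0 < residue): size = 5//2+1 = 3 -> idx[0:3] = 0  (dims 0,1,2)
    # view 1 (i=1):           size = 5//2   = 2 -> idx[3:5] = 1  (dims 3,4)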
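For reference, the quantity maintained by both _update_inernal implementations, restated in math (my notation, read off the code rather than taken from the patch): with \gamma^{(m)}_j the posterior on-probability of latent dimension j in view m (gamma_group), each prior caches the probability that the dimension is switched off in every view,

    \bar{b}_j = \prod_{m=1}^{M} \bigl(1 - \gamma^{(m)}_j\bigr)   % stored as _b_prob_all

and the Gaussian part of each model's KL term is weighted by its complement and split evenly over the M copies of the shared X:

    \mathrm{KL}_x = \frac{1}{2M} \sum_{n,j} \bigl(1 - \bar{b}_j\bigr)
                    \left( \frac{\mu_{nj}^2 + S_{nj}}{\sigma^2} - \log S_{nj} + \log\sigma^2 - 1 \right)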
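The MPI branch of optimize() uses a simple broadcast protocol: rank 0 runs the optimizer and, on every parameter update, broadcasts the flag 1 followed by the parameter vector, while a final flag of -1 releases the workers. A minimal launch sketch, assuming mpi4py and that every rank constructs an identical model (same data and seed); run with e.g. `mpiexec -n 4 python script.py`:

    import numpy as np
    from mpi4py import MPI
    from GPy.models.ss_mrd import SSMRD

    comm = MPI.COMM_WORLD
    np.random.seed(0)                 # identical data on every rank
    Y1 = np.random.randn(50, 8)
    Y2 = np.random.randn(50, 6)

    m = SSMRD([Y1, Y2], input_dim=5, mpi_comm=comm)
    # Rank 0 drives the optimizer; ranks > 0 block in the Bcast loop inside
    # SSMRD.optimize, applying each broadcast parameter vector until flag -1.
    m.optimize(max_iters=10)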