diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index 7552b8ac..333fd526 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -66,7 +66,7 @@ class SparseGP(GP): #gradients wrt Z self.Z.gradient = self.kern.gradients_X(dL_dKmm, self.Z) self.Z.gradient += self.kern.gradients_Z_expectations( - self.grad_dict['dL_dpsi1'], self.grad_dict['dL_dpsi2'], Z=self.Z, variational_posterior=self.X) + self.grad_dict['dL_dpsi0'], self.grad_dict['dL_dpsi1'], self.grad_dict['dL_dpsi2'], Z=self.Z, variational_posterior=self.X) else: #gradients wrt kernel self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X) diff --git a/GPy/inference/latent_function_inference/var_dtc_parallel.py b/GPy/inference/latent_function_inference/var_dtc_parallel.py index 87236e2a..7c347213 100644 --- a/GPy/inference/latent_function_inference/var_dtc_parallel.py +++ b/GPy/inference/latent_function_inference/var_dtc_parallel.py @@ -70,12 +70,13 @@ class VarDTC_minibatch(object): #see whether we've got a different noise variance for each datum beta = 1./np.fmax(likelihood.variance, 1e-6) het_noise = beta.size > 1 + if het_noise: + self.batchsize = 1 # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency! #self.YYTfactor = beta*self.get_YYTfactor(Y) YYT_factor = Y trYYT = self.get_trYYT(Y) - psi2_full = np.zeros((num_inducing,num_inducing)) psi1Y_full = np.zeros((output_dim,num_inducing)) # DxM psi0_full = 0 @@ -104,19 +105,18 @@ class VarDTC_minibatch(object): YRY_full += (beta_slice*np.square(Y_slice).sum(axis=-1)).sum() else: psi0_full += psi0.sum() - psi1Y_full += np.dot(Y_slice.T,psi1) # DxM - + psi1Y_full += np.dot(Y_slice.T,psi1) # DxM if uncertain_inputs: if het_noise: - psi2_full += np.einsum('n,nmo->mo',beta_slice,psi2) + psi2_full += beta_slice*psi2 else: - psi2_full += psi2.sum(axis=0) + psi2_full += psi2 else: if het_noise: - psi2_full += np.einsum('n,nm,no->mo',beta_slice,psi1,psi1) + psi2_full += beta_slice*np.outer(psi1,psi1) else: - psi2_full += tdot(psi1.T) + psi2_full += np.outer(psi1,psi1) if not het_noise: psi0_full *= beta @@ -223,7 +223,7 @@ class VarDTC_minibatch(object): psi2 = None if het_noise: - beta = beta[n_start:n_end] + beta = beta[n_start] # assuming batchsize==1 betaY = beta*Y_slice betapsi1 = np.einsum('n,nm->nm',beta,psi1) @@ -244,7 +244,7 @@ class VarDTC_minibatch(object): dL_dpsi1 = np.dot(betaY,v.T) if uncertain_inputs: - dL_dpsi2 = np.einsum('n,mo->nmo',beta * np.ones((n_end-n_start,)),dL_dpsi2R) + dL_dpsi2 = beta* dL_dpsi2R else: dL_dpsi1 += np.dot(betapsi1,dL_dpsi2R)*2. dL_dpsi2 = None @@ -262,11 +262,11 @@ class VarDTC_minibatch(object): dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0)-output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum(axis=-1) else: if uncertain_inputs: - psiR = np.einsum('mo,nmo->',dL_dpsi2R,psi2) + psiR = np.einsum('mo,mo->',dL_dpsi2R,psi2) else: psiR = np.einsum('nm,no,mo->',psi1,psi1,dL_dpsi2R) - dL_dthetaL = ((np.square(betaY)).sum() + np.square(beta)*output_dim*(psi0.sum())-num_slice*output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum() + dL_dthetaL = ((np.square(betaY)).sum() + beta*beta*output_dim*(psi0.sum())-num_slice*output_dim*beta)/2. - beta*beta*psiR- (betaY*np.dot(betapsi1,v)).sum() if uncertain_inputs: grad_dict = {'dL_dpsi0':dL_dpsi0, @@ -296,7 +296,7 @@ def update_gradients(model): kern_grad = model.kern.gradient.copy() #gradients w.r.t. Z - model.Z.gradient[:,model.kern.active_dims] = model.kern.gradients_X(dL_dKmm, model.Z) + model.Z.gradient = model.kern.gradients_X(dL_dKmm, model.Z) isEnd = False while not isEnd: @@ -309,8 +309,8 @@ def update_gradients(model): kern_grad += model.kern.gradient #gradients w.r.t. Z - model.Z.gradient[:,model.kern.active_dims] += model.kern.gradients_Z_expectations( - grad_dict['dL_dpsi1'], grad_dict['dL_dpsi2'], Z=model.Z, variational_posterior=X_slice) + model.Z.gradient += model.kern.gradients_Z_expectations( + dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'], Z=model.Z, variational_posterior=X_slice) #gradients w.r.t. posterior parameters of X X_grad = model.kern.gradients_qX_expectations(variational_posterior=X_slice, Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2']) diff --git a/GPy/kern/_src/add.py b/GPy/kern/_src/add.py index 88b8e40c..9589d6be 100644 --- a/GPy/kern/_src/add.py +++ b/GPy/kern/_src/add.py @@ -119,7 +119,7 @@ class Add(CombinationKernel): eff_dL_dpsi1 += dL_dpsi2.sum(1) * p2.psi1(Z, variational_posterior) * 2. p1.update_gradients_expectations(dL_dpsi0, eff_dL_dpsi1, dL_dpsi2, Z, variational_posterior) - def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + def gradients_Z_expectations(self, dL_psi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): from static import White, Bias target = np.zeros(Z.shape) for p1 in self.parts: diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index 70bd42b9..09eb6dd9 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -115,7 +115,7 @@ class Kern(Parameterized): """ raise NotImplementedError - def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): """ Returns the derivative of the objective wrt Z, using the chain rule through the expectation variables. diff --git a/GPy/kern/_src/kernel_slice_operations.py b/GPy/kern/_src/kernel_slice_operations.py index c1c8d7f1..3473ffce 100644 --- a/GPy/kern/_src/kernel_slice_operations.py +++ b/GPy/kern/_src/kernel_slice_operations.py @@ -124,9 +124,9 @@ def _slice_update_gradients_expectations(f): def _slice_gradients_Z_expectations(f): @wraps(f) - def wrap(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + def wrap(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): with _Slice_wrap(self, Z, variational_posterior) as s: - ret = s.handle_return_array(f(self, dL_dpsi1, dL_dpsi2, s.X, s.X2)) + ret = s.handle_return_array(f(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, s.X, s.X2)) return ret return wrap diff --git a/GPy/kern/_src/linear.py b/GPy/kern/_src/linear.py index f9dacf02..007649b0 100644 --- a/GPy/kern/_src/linear.py +++ b/GPy/kern/_src/linear.py @@ -168,7 +168,7 @@ class Linear(Kern): else: self.variances.gradient += 2.*np.sum(dL_dpsi2 * self.psi2(Z, variational_posterior))/self.variances - def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): gamma = variational_posterior.binary_prob mu = variational_posterior.mean diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py index d8414cfb..5e30b5c5 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py @@ -9,12 +9,23 @@ import numpy as np from GPy.util.caching import Cache_this @Cache_this(limit=1) -def _Z_distances(Z): - Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q - Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q - return Zhat, Zdist +def psicomputations(variance, lengthscale, Z, mu, S, gamma): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi0, psi1 and psi2 + # Produced intermediate results: + # _psi1 NxM + + psi0 = np.empty(mu.shape[0]) + psi0[:] = variance + psi1 = _psi1computations(variance, lengthscale, Z, mu, S, gamma) + psi2 = _psi2computations(variance, lengthscale, Z, mu, S, gamma) + return psi0, psi1, psi2 -@Cache_this(limit=1) def _psi1computations(variance, lengthscale, Z, mu, S, gamma): """ Z - MxQ @@ -22,15 +33,10 @@ def _psi1computations(variance, lengthscale, Z, mu, S, gamma): S - NxQ gamma - NxQ """ - # here are the "statistics" for psi1 and psi2 + # here are the "statistics" for psi1 # Produced intermediate results: # _psi1 NxM - # _dpsi1_dvariance NxM - # _dpsi1_dlengthscale NxMxQ - # _dpsi1_dZ NxMxQ - # _dpsi1_dgamma NxMxQ - # _dpsi1_dmu NxMxQ - # _dpsi1_dS NxMxQ + lengthscale2 = np.square(lengthscale) @@ -40,25 +46,15 @@ def _psi1computations(variance, lengthscale, Z, mu, S, gamma): _psi1_dist = Z[None, :, :] - mu[:, None, :] # NxMxQ _psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom) # NxMxQ _psi1_common = gamma[:,None,:] / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #Nx1xQ - _psi1_exponent1 = np.log(gamma[:,None,:]) -0.5 * (_psi1_dist_sq + np.log(_psi1_denom)) # NxMxQ - _psi1_exponent2 = np.log(1.-gamma[:,None,:]) -0.5 * (np.square(Z[None,:,:])/lengthscale2) # NxMxQ + _psi1_exponent1 = np.log(gamma[:,None,:]) - (_psi1_dist_sq + np.log(_psi1_denom))/2. # NxMxQ + _psi1_exponent2 = np.log(1.-gamma[:,None,:]) - (np.square(Z[None,:,:])/lengthscale2)/2. # NxMxQ _psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2) _psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ _psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM - _psi1_exp_dist_sq = np.exp(-0.5*_psi1_dist_sq) # NxMxQ - _psi1_exp_Z = np.exp(-0.5*np.square(Z[None,:,:])/lengthscale2) # 1xMxQ - _psi1_q = variance * np.exp(_psi1_exp_sum[:,:,None] - _psi1_exponent) # NxMxQ _psi1 = variance * np.exp(_psi1_exp_sum) # NxM - _dpsi1_dvariance = _psi1 / variance # NxM - _dpsi1_dgamma = _psi1_q * (_psi1_exp_dist_sq/_psi1_denom_sqrt-_psi1_exp_Z) # NxMxQ - _dpsi1_dmu = _psi1_q * (_psi1_exp_dist_sq * _psi1_dist * _psi1_common) # NxMxQ - _dpsi1_dS = _psi1_q * (_psi1_exp_dist_sq * _psi1_common * 0.5 * (_psi1_dist_sq - 1.)) # NxMxQ - _dpsi1_dZ = _psi1_q * (- _psi1_common * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z) # NxMxQ - _dpsi1_dlengthscale = 2.*lengthscale*_psi1_q * (0.5*_psi1_common*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + 0.5*(1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z) # NxMxQ - return _psi1, _dpsi1_dvariance, _dpsi1_dgamma, _dpsi1_dmu, _dpsi1_dS, _dpsi1_dZ, _dpsi1_dlengthscale + return _psi1 -@Cache_this(limit=1) def _psi2computations(variance, lengthscale, Z, mu, S, gamma): """ Z - MxQ @@ -66,19 +62,14 @@ def _psi2computations(variance, lengthscale, Z, mu, S, gamma): S - NxQ gamma - NxQ """ - # here are the "statistics" for psi1 and psi2 + # here are the "statistics" for psi2 # Produced intermediate results: - # _psi2 NxMxM - # _psi2_dvariance NxMxM - # _psi2_dlengthscale NxMxMxQ - # _psi2_dZ NxMxMxQ - # _psi2_dgamma NxMxMxQ - # _psi2_dmu NxMxMxQ - # _psi2_dS NxMxMxQ + # _psi2 MxM lengthscale2 = np.square(lengthscale) - _psi2_Zhat, _psi2_Zdist = _Z_distances(Z) + _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q + _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ @@ -93,15 +84,130 @@ def _psi2computations(variance, lengthscale, Z, mu, S, gamma): _psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2) _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max)) _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM - _psi2_q = np.square(variance) * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ + _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM + + return _psi2 + +def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): + ARD = (len(lengthscale)!=1) + + dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1, dgamma_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) + dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2, dgamma_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) + + dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2 + + dL_dlengscale = dl_psi1 + dl_psi2 + if not ARD: + dL_dlengscale = dL_dlengscale.sum() + + dL_dgamma = dgamma_psi1 + dgamma_psi2 + dL_dmu = dmu_psi1 + dmu_psi2 + dL_dS = dS_psi1 + dS_psi2 + dL_dZ = dZ_psi1 + dZ_psi2 + + return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, dL_dgamma + +def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, gamma): + """ + dL_dpsi1 - NxM + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi1 + # Produced intermediate results: dL_dparams w.r.t. psi1 + # _dL_dvariance 1 + # _dL_dlengthscale Q + # _dL_dZ MxQ + # _dL_dgamma NxQ + # _dL_dmu NxQ + # _dL_dS NxQ + + lengthscale2 = np.square(lengthscale) + + # psi1 + _psi1_denom = S / lengthscale2 + 1. # NxQ + _psi1_denom_sqrt = np.sqrt(_psi1_denom) #NxQ + _psi1_dist = Z[None, :, :] - mu[:, None, :] # NxMxQ + _psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom[:,None,:]) # NxMxQ + _psi1_common = gamma / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #NxQ + _psi1_exponent1 = np.log(gamma[:,None,:]) -0.5 * (_psi1_dist_sq + np.log(_psi1_denom[:, None,:])) # NxMxQ + _psi1_exponent2 = np.log(1.-gamma[:,None,:]) -0.5 * (np.square(Z[None,:,:])/lengthscale2) # NxMxQ + _psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2) + _psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ + _psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM + _psi1_exp_dist_sq = np.exp(-0.5*_psi1_dist_sq) # NxMxQ + _psi1_exp_Z = np.exp(-0.5*np.square(Z[None,:,:])/lengthscale2) # 1xMxQ + _psi1_q = variance * np.exp(_psi1_exp_sum[:,:,None] - _psi1_exponent) # NxMxQ + _psi1 = variance * np.exp(_psi1_exp_sum) # NxM + _dL_dvariance = np.einsum('nm,nm->',dL_dpsi1, _psi1)/variance # 1 + _dL_dgamma = np.einsum('nm,nmq,nmq->nq',dL_dpsi1, _psi1_q, (_psi1_exp_dist_sq/_psi1_denom_sqrt[:,None,:]-_psi1_exp_Z)) # NxQ + _dL_dmu = np.einsum('nm, nmq, nmq, nmq, nq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_dist,_psi1_common) # NxQ + _dL_dS = np.einsum('nm,nmq,nmq,nq,nmq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_common,(_psi1_dist_sq-1.))/2. # NxQ + _dL_dZ = np.einsum('nm,nmq,nmq->mq',dL_dpsi1,_psi1_q, (- _psi1_common[:,None,:] * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z)) + _dL_dlengthscale = lengthscale* np.einsum('nm,nmq,nmq->q',dL_dpsi1,_psi1_q,(_psi1_common[:,None,:]*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + (1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z)) + +# _dpsi1_dmu = _psi1_q * (_psi1_exp_dist_sq * _psi1_dist * _psi1_common) # NxMxQ +# _dpsi1_dS = _psi1_q * (_psi1_exp_dist_sq * _psi1_common * 0.5 * (_psi1_dist_sq - 1.)) # NxMxQ +# _dpsi1_dZ = _psi1_q * (- _psi1_common * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z) # NxMxQ +# _dpsi1_dlengthscale = 2.*lengthscale*_psi1_q * (0.5*_psi1_common*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + 0.5*(1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z) # NxMxQ + + return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma + +def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, gamma): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + dL_dpsi2 - MxM + """ + # here are the "statistics" for psi2 + # Produced the derivatives w.r.t. psi2: + # _dL_dvariance 1 + # _dL_dlengthscale Q + # _dL_dZ MxQ + # _dL_dgamma NxQ + # _dL_dmu NxQ + # _dL_dS NxQ + + lengthscale2 = np.square(lengthscale) + + _psi2_Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q + _psi2_Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q + _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q + _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ + + # psi2 + _psi2_denom = 2.*S / lengthscale2 + 1. # NxQ + _psi2_denom_sqrt = np.sqrt(_psi2_denom) + _psi2_mudist = mu[:,None,None,:]-_psi2_Zhat #N,M,M,Q + _psi2_mudist_sq = np.square(_psi2_mudist)/(lengthscale2*_psi2_denom[:,None,None,:]) + _psi2_common = gamma/(lengthscale2 * _psi2_denom * _psi2_denom_sqrt) # NxQ + _psi2_exponent1 = -_psi2_Zdist_sq -_psi2_mudist_sq -0.5*np.log(_psi2_denom[:,None,None,:])+np.log(gamma[:,None,None,:]) #N,M,M,Q + _psi2_exponent2 = np.log(1.-gamma[:,None,None,:]) - 0.5*(_psi2_Z_sq_sum) # NxMxMxQ + _psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2) + _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max)) + _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM + _psi2_q = variance*variance * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ _psi2_exp_dist_sq = np.exp(-_psi2_Zdist_sq -_psi2_mudist_sq) # NxMxMxQ _psi2_exp_Z = np.exp(-0.5*_psi2_Z_sq_sum) # MxMxQ - _psi2 = np.square(variance) * np.exp(_psi2_exp_sum) # N,M,M - _dpsi2_dvariance = 2. * _psi2/variance # NxMxM - _dpsi2_dgamma = _psi2_q * (_psi2_exp_dist_sq/_psi2_denom_sqrt - _psi2_exp_Z) # NxMxMxQ - _dpsi2_dmu = _psi2_q * (-2.*_psi2_common*_psi2_mudist * _psi2_exp_dist_sq) # NxMxMxQ - _dpsi2_dS = _psi2_q * (_psi2_common * (2.*_psi2_mudist_sq - 1.) * _psi2_exp_dist_sq) # NxMxMxQ - _dpsi2_dZ = 2.*_psi2_q * (_psi2_common*(-_psi2_Zdist*_psi2_denom+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z) # NxMxMxQ - _dpsi2_dlengthscale = 2.*lengthscale* _psi2_q * (_psi2_common*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z) # NxMxMxQ + _psi2 = variance*variance * (np.exp(_psi2_exp_sum).sum(axis=0)) # MxM + _dL_dvariance = np.einsum('mo,mo->',dL_dpsi2,_psi2)*2./variance + _dL_dgamma = np.einsum('mo,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,(_psi2_exp_dist_sq/_psi2_denom_sqrt[:,None,None,:] - _psi2_exp_Z)) + _dL_dmu = -2.*np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,_psi2_common,_psi2_mudist,_psi2_exp_dist_sq) + _dL_dS = np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q, _psi2_common, (2.*_psi2_mudist_sq-1.), _psi2_exp_dist_sq) + _dL_dZ = 2.*np.einsum('mo,nmoq,nmoq->mq',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(-_psi2_Zdist*_psi2_denom[:,None,None,:]+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z)) +# print _psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq #+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z) + _dL_dlengthscale = 2.*lengthscale* np.einsum('mo,nmoq,nmoq->q',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z)) + + +# _dpsi2_dvariance = 2. * _psi2/variance # NxMxM +# _dpsi2_dgamma = _psi2_q * (_psi2_exp_dist_sq/_psi2_denom_sqrt - _psi2_exp_Z) # NxMxMxQ +# _dpsi2_dmu = _psi2_q * (-2.*_psi2_common*_psi2_mudist * _psi2_exp_dist_sq) # NxMxMxQ +# _dpsi2_dS = _psi2_q * (_psi2_common * (2.*_psi2_mudist_sq - 1.) * _psi2_exp_dist_sq) # NxMxMxQ +# _dpsi2_dZ = 2.*_psi2_q * (_psi2_common*(-_psi2_Zdist*_psi2_denom+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z) # NxMxMxQ +# _dpsi2_dlengthscale = 2.*lengthscale* _psi2_q * (_psi2_common*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z) # NxMxMxQ - return _psi2, _dpsi2_dvariance, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _dpsi2_dZ, _dpsi2_dlengthscale + return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index e0071fb9..5944e765 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -41,9 +41,11 @@ class RBF(Stationary): #---------------------------------------# def psi0(self, Z, variational_posterior): - if self.useGPU: - if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): + if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): + if self.useGPU: return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[0] + else: + return ssrbf_psi_comp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[0] else: return self.Kdiag(variational_posterior.mean) @@ -52,7 +54,7 @@ class RBF(Stationary): if self.useGPU: return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[1] else: - psi1, _, _, _, _, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) + return ssrbf_psi_comp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[1] else: _, _, _, psi1 = self._psi1computations(Z, variational_posterior) return psi1 @@ -62,7 +64,7 @@ class RBF(Stationary): if self.useGPU: return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[2] else: - psi2, _, _, _, _, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) + return ssrbf_psi_comp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[2] else: _, _, _, _, psi2 = self._psi2computations(Z, variational_posterior) return psi2 @@ -73,26 +75,30 @@ class RBF(Stationary): if self.useGPU: self.psicomp.update_gradients_expectations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior) else: +# dL_dvar, dL_dlengscale, dL_dZ, dL_dgamma, dL_dmu, dL_dS = ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior) + dL_dvar, dL_dlengscale, _, _, _, _ = ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior) + self.variance.gradient = dL_dvar + self.lengthscale.gradient = dL_dlengscale - _, _dpsi1_dvariance, _, _, _, _, _dpsi1_dlengthscale = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) - _, _dpsi2_dvariance, _, _, _, _, _dpsi2_dlengthscale = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) - - #contributions from psi0: - self.variance.gradient = np.sum(dL_dpsi0) - - #from psi1 - self.variance.gradient += np.sum(dL_dpsi1 * _dpsi1_dvariance) - if self.ARD: - self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).reshape(-1,self.input_dim).sum(axis=0) - else: - self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).sum() - - #from psi2 - self.variance.gradient += (dL_dpsi2 * _dpsi2_dvariance).sum() - if self.ARD: - self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).reshape(-1,self.input_dim).sum(axis=0) - else: - self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).sum() +# _, _dpsi1_dvariance, _, _, _, _, _dpsi1_dlengthscale = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) +# _, _dpsi2_dvariance, _, _, _, _, _dpsi2_dlengthscale = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) +# +# #contributions from psi0: +# self.variance.gradient = np.sum(dL_dpsi0) +# +# #from psi1 +# self.variance.gradient += np.sum(dL_dpsi1 * _dpsi1_dvariance) +# if self.ARD: +# self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).reshape(-1,self.input_dim).sum(axis=0) +# else: +# self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).sum() +# +# #from psi2 +# self.variance.gradient += (dL_dpsi2 * _dpsi2_dvariance).sum() +# if self.ARD: +# self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).reshape(-1,self.input_dim).sum(axis=0) +# else: +# self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).sum() elif isinstance(variational_posterior, variational.NormalPosterior): l2 = self.lengthscale**2 @@ -125,22 +131,25 @@ class RBF(Stationary): else: raise ValueError, "unknown distriubtion received for psi-statistics" - def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): # Spike-and-Slab GPLVM if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): if self.useGPU: return self.psicomp.gradients_Z_expectations(dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior) else: - _, _, _, _, _, _dpsi1_dZ, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) - _, _, _, _, _, _dpsi2_dZ, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) - - #psi1 - grad = (dL_dpsi1[:, :, None] * _dpsi1_dZ).sum(axis=0) - - #psi2 - grad += (dL_dpsi2[:, :, :, None] * _dpsi2_dZ).sum(axis=0).sum(axis=1) - - return grad + _, _, dL_dZ, _, _, _ = ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior) + return dL_dZ + +# _, _, _, _, _, _dpsi1_dZ, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) +# _, _, _, _, _, _dpsi2_dZ, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) +# +# #psi1 +# grad = (dL_dpsi1[:, :, None] * _dpsi1_dZ).sum(axis=0) +# +# #psi2 +# grad += (dL_dpsi2[:, :, :, None] * _dpsi2_dZ).sum(axis=0).sum(axis=1) +# +# return grad elif isinstance(variational_posterior, variational.NormalPosterior): l2 = self.lengthscale **2 @@ -166,26 +175,29 @@ class RBF(Stationary): if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): if self.useGPU: return self.psicomp.gradients_qX_expectations(dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior) - else: - ndata = variational_posterior.mean.shape[0] - - _, _, _dpsi1_dgamma, _dpsi1_dmu, _dpsi1_dS, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) - _, _, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) - - #psi1 - grad_mu = (dL_dpsi1[:, :, None] * _dpsi1_dmu).sum(axis=1) - grad_S = (dL_dpsi1[:, :, None] * _dpsi1_dS).sum(axis=1) - grad_gamma = (dL_dpsi1[:,:,None] * _dpsi1_dgamma).sum(axis=1) - - #psi2 - grad_mu += (dL_dpsi2[:, :, :, None] * _dpsi2_dmu).reshape(ndata,-1,self.input_dim).sum(axis=1) - grad_S += (dL_dpsi2[:, :, :, None] * _dpsi2_dS).reshape(ndata,-1,self.input_dim).sum(axis=1) - grad_gamma += (dL_dpsi2[:,:,:, None] * _dpsi2_dgamma).reshape(ndata,-1,self.input_dim).sum(axis=1) - - if self.group_spike_prob: - grad_gamma[:] = grad_gamma.mean(axis=0) - - return grad_mu, grad_S, grad_gamma + else: + _, _, _, dL_dmu, dL_dS, dL_dgamma = ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior) + return dL_dmu, dL_dS, dL_dgamma + +# ndata = variational_posterior.mean.shape[0] +# +# _, _, _dpsi1_dgamma, _dpsi1_dmu, _dpsi1_dS, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) +# _, _, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) +# +# #psi1 +# grad_mu = (dL_dpsi1[:, :, None] * _dpsi1_dmu).sum(axis=1) +# grad_S = (dL_dpsi1[:, :, None] * _dpsi1_dS).sum(axis=1) +# grad_gamma = (dL_dpsi1[:,:,None] * _dpsi1_dgamma).sum(axis=1) +# +# #psi2 +# grad_mu += (dL_dpsi2[:, :, :, None] * _dpsi2_dmu).reshape(ndata,-1,self.input_dim).sum(axis=1) +# grad_S += (dL_dpsi2[:, :, :, None] * _dpsi2_dS).reshape(ndata,-1,self.input_dim).sum(axis=1) +# grad_gamma += (dL_dpsi2[:,:,:, None] * _dpsi2_dgamma).reshape(ndata,-1,self.input_dim).sum(axis=1) +# +# if self.group_spike_prob: +# grad_gamma[:] = grad_gamma.mean(axis=0) +# +# return grad_mu, grad_S, grad_gamma elif isinstance(variational_posterior, variational.NormalPosterior): diff --git a/GPy/kern/_src/static.py b/GPy/kern/_src/static.py index d853b4e0..68884937 100644 --- a/GPy/kern/_src/static.py +++ b/GPy/kern/_src/static.py @@ -25,7 +25,7 @@ class Static(Kern): def gradients_X_diag(self, dL_dKdiag, X): return np.zeros(X.shape) - def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior): + def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): return np.zeros(Z.shape) def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py index 57be302a..dba12700 100644 --- a/GPy/models/ss_gplvm.py +++ b/GPy/models/ss_gplvm.py @@ -11,7 +11,7 @@ from ..likelihoods import Gaussian from ..inference.optimization import SCG from ..util import linalg from ..core.parameterization.variational import SpikeAndSlabPrior, SpikeAndSlabPosterior -from ..inference.latent_function_inference.var_dtc_parallel import update_gradients +from ..inference.latent_function_inference.var_dtc_parallel import update_gradients, VarDTC_minibatch from ..inference.latent_function_inference.var_dtc_gpu import VarDTC_GPU @@ -41,7 +41,7 @@ class SSGPLVM(SparseGP): if X_variance is None: # The variance of the variational approximation (S) X_variance = np.random.uniform(0,.1,X.shape) - gamma = np.empty_like(X) # The posterior probabilities of the binary variable in the variational approximation + gamma = np.empty_like(X, order='F') # The posterior probabilities of the binary variable in the variational approximation gamma[:] = 0.5 + 0.01 * np.random.randn(X.shape[0], input_dim) if group_spike: @@ -60,12 +60,15 @@ class SSGPLVM(SparseGP): pi = np.empty((input_dim)) pi[:] = 0.5 self.variational_prior = SpikeAndSlabPrior(pi=pi) # the prior probability of the latent binary variable b + + X = np.asfortranarray(X) + X_variance = np.asfortranarray(X_variance) + gamma = np.asfortranarray(gamma) X = SpikeAndSlabPosterior(X, X_variance, gamma) if group_spike: kernel.group_spike_prob = True self.variational_prior.group_spike_prob = True - SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, name, **kwargs) self.add_parameter(self.X, index=0) @@ -76,7 +79,7 @@ class SSGPLVM(SparseGP): X.mean.gradient, X.variance.gradient, X.binary_prob.gradient = X_grad def parameters_changed(self): - if isinstance(self.inference_method, VarDTC_GPU): + if isinstance(self.inference_method, VarDTC_GPU) or isinstance(self.inference_method, VarDTC_minibatch): update_gradients(self) return diff --git a/GPy/util/initialization.py b/GPy/util/initialization.py index 22e63b6b..dd14c9a5 100644 --- a/GPy/util/initialization.py +++ b/GPy/util/initialization.py @@ -8,7 +8,7 @@ import numpy as np from GPy.util.pca import pca def initialize_latent(init, input_dim, Y): - Xr = np.random.randn(Y.shape[0], input_dim) + Xr = np.asfortranarray(np.random.randn(Y.shape[0], input_dim)) if init == 'PCA': p = pca(Y) PC = p.project(Y, min(input_dim, Y.shape[1])) diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py index b204f813..661a2b47 100644 --- a/GPy/util/linalg.py +++ b/GPy/util/linalg.py @@ -123,7 +123,7 @@ def dtrtrs(A, B, lower=1, trans=0, unitdiag=0): :returns: """ - A = force_F_ordered(A) + A = np.asfortranarray(A) #Note: B does not seem to need to be F ordered! return lapack.dtrtrs(A, B, lower=lower, trans=trans, unitdiag=unitdiag)