diff --git a/GPy/core/parameterization/index_operations.py b/GPy/core/parameterization/index_operations.py
index 7fc76d07..78c0d2b9 100644
--- a/GPy/core/parameterization/index_operations.py
+++ b/GPy/core/parameterization/index_operations.py
@@ -22,15 +22,44 @@ def extract_properties_to_index(index, props):
 
 class ParameterIndexOperations(object):
-    '''
-    Index operations for storing param index _properties
-    This class enables index with slices retrieved from object.__getitem__ calls.
-    Adding an index will add the selected indexes by the slice of an indexarray
-    indexing a shape shaped array to the flattened index array. Remove will
-    remove the selected slice indices from the flattened array.
-    You can give an offset to set an offset for the given indices in the
-    index array, for multi-param handling.
-    '''
+    """
+    This object wraps a dictionary, whose keys are _operations_ that we'd like
+    to apply to a parameter array, and whose values are numpy integer arrays
+    which index the parameter array appropriately.
+
+    A model instance will contain one instance of this class for each thing
+    that needs indexing (i.e. constraints, ties and priors). Parameters within
+    the model contain instances of the ParameterIndexOperationsView class,
+    which can map from a 'local' index (starting at 0) to this global index.
+
+    Here's an illustration:
+
+    #=======================================================================
+    model : 0 1 2 3 4 5 6 7 8 9
+    key1:           4 5
+    key2:                 7 8
+
+    param1: 0 1 2 3 4 5
+    key1:       2 3
+    key2:             5
+
+    param2: 0 1 2 3 4
+    key1:   0
+    key2:       2 3
+    #=======================================================================
+
+    The views of this global index have a subset of the keys in this global
+    (model) index.
+
+    Adding a new key (e.g. a constraint) to a view will cause the view to pass
+    the new key to the global index, along with the local index and an offset.
+    This global index then stores the key and the appropriate global index
+    (which can be seen by the view).
+
+    See also:
+    ParameterIndexOperationsView
+
+    """
     _offset = 0
     def __init__(self, constraints=None):
         self._properties = IntArrayDict()
diff --git a/GPy/core/parameterization/ties_and_remappings.py b/GPy/core/parameterization/ties_and_remappings.py
new file
index 00000000..da46acaa
--- /dev/null
+++ b/GPy/core/parameterization/ties_and_remappings.py
@@ -0,0 +1,91 @@
+# Copyright (c) 2014, James Hensman, Max Zwiessele
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+from parameterized import Parameterized
+from param import Param
+
+class Remapping(Parameterized):
+    def mapping(self):
+        """
+        The return value of this function gives the values which the re-mapped
+        parameters should take. Implement in sub-classes.
+ """ + raise NotImplementedError + + def callback(self): + raise NotImplementedError + + def __str__(self): + return self.name + + def parameters_changed(self): + #ensure all out parameters have the correct value, as specified by our mapping + index = self._highest_parent_.constraints[self] + self._highest_parent_.param_array[index] = self.mapping() + [p.notify_observers(which=self) for p in self.tied_parameters] + +class Fix(Remapping): + pass + + + + +class Tie(Remapping): + def __init__(self, value, name): + super(Tie, self).__init__(name) + self.tied_parameters = [] + self.value = Param('val', value) + self.add_parameter(self.value) + + def add_tied_parameter(self, p): + self.tied_parameters.append(p) + p.add_observer(self, self.callback) + self.parameters_changed() + + def callback(self, param=None, which=None): + """ + This gets called whenever any of the tied parameters changes. we spend + considerable effort working out what has changed and to what value. + Then we store that value in self.value, and broadcast it everywhere + with parameters_changed. + """ + if which is self:return + index = self._highest_parent_.constraints[self] + if len(index)==0: + return # nothing to tie together, this tie exists without any tied parameters + self.collate_gradient() + vals = self._highest_parent_.param_array[index] + uvals = np.unique(vals) + if len(uvals)==1: + #all of the tied things are at the same value + if np.all(self.value==uvals[0]): + return # DO NOT DO ANY CHANGES IF THE TIED PART IS NOT CHANGED! + self.value[...] = uvals[0] + elif len(uvals)==2: + #only *one* of the tied things has changed. it must be different to self.value + newval = uvals[uvals != self.value*1] + self.value[...] = newval + else: + #more than one of the tied things changed. panic. + raise ValueError, "something is wrong with the tieing" + def parameters_changed(self): + #ensure all out parameters have the correct value, as specified by our mapping + index = self._highest_parent_.constraints[self] + if np.all(self._highest_parent_.param_array[index]==self.value): + return # STOP TRIGGER THE UPDATE LOOP MULTIPLE TIMES!!! + self._highest_parent_.param_array[index] = self.mapping() + [p.notify_observers(which=self) for p in self.tied_parameters] + self.collate_gradient() + + def mapping(self): + return self.value + + def collate_gradient(self): + index = self._highest_parent_.constraints[self] + self.value.gradient = np.sum(self._highest_parent_.gradient[index]) + + + + + diff --git a/GPy/core/parameterization/transformations.py b/GPy/core/parameterization/transformations.py index 506d80cd..df31f09f 100644 --- a/GPy/core/parameterization/transformations.py +++ b/GPy/core/parameterization/transformations.py @@ -195,6 +195,9 @@ class Logistic(Transformation): self.lower, self.upper = float(lower), float(upper) self.difference = self.upper - self.lower def f(self, x): + if (x<-300.).any(): + x = x.copy() + x[x<-300.] = -300. return self.lower + self.difference / (1. + np.exp(-x)) def finv(self, f): return np.log(np.clip(f - self.lower, 1e-10, np.inf) / np.clip(self.upper - f, 1e-10, np.inf)) diff --git a/GPy/core/parameterization/variational.py b/GPy/core/parameterization/variational.py index fa32f642..cf8d3067 100644 --- a/GPy/core/parameterization/variational.py +++ b/GPy/core/parameterization/variational.py @@ -34,36 +34,38 @@ class NormalPrior(VariationalPrior): variational_posterior.variance.gradient -= (1. - (1. 
/ (variational_posterior.variance))) * 0.5 class SpikeAndSlabPrior(VariationalPrior): - def __init__(self, pi, variance = 1.0, name='SpikeAndSlabPrior', **kw): + def __init__(self, pi=None, learnPi=False, variance = 1.0, name='SpikeAndSlabPrior', **kw): super(VariationalPrior, self).__init__(name=name, **kw) - assert variance==1.0, "Not Implemented!" self.pi = Param('pi', pi, Logistic(1e-10,1.-1e-10)) self.variance = Param('variance',variance) - self.add_parameters(self.pi) - self.group_spike_prob = False + self.learnPi = learnPi + if learnPi: + self.add_parameters(self.pi) def KL_divergence(self, variational_posterior): mu = variational_posterior.mean S = variational_posterior.variance gamma = variational_posterior.binary_prob - var_mean = np.square(mu) - var_S = (S - np.log(S)) + var_mean = np.square(mu)/self.variance + var_S = (S/self.variance - np.log(S)) var_gamma = (gamma*np.log(gamma/self.pi)).sum()+((1-gamma)*np.log((1-gamma)/(1-self.pi))).sum() - return var_gamma+ 0.5 * (gamma* (var_mean + var_S -1)).sum() + return var_gamma+ (gamma* (np.log(self.variance)-1. +var_mean + var_S)).sum()/2. def update_gradients_KL(self, variational_posterior): mu = variational_posterior.mean S = variational_posterior.variance gamma = variational_posterior.binary_prob - if self.group_spike_prob: - gamma_grad = np.log((1-self.pi)/self.pi*gamma/(1.-gamma))+(np.square(mu)+S-np.log(S)-1.)/2. - gamma.gradient -= gamma_grad.mean(axis=0) - else: - gamma.gradient -= np.log((1-self.pi)/self.pi*gamma/(1.-gamma))+(np.square(mu)+S-np.log(S)-1.)/2. - mu.gradient -= gamma*mu - S.gradient -= (1. - (1. / (S))) * gamma /2. - self.pi.gradient = (gamma/self.pi - (1.-gamma)/(1.-self.pi)).sum(axis=0) + gamma.gradient -= np.log((1-self.pi)/self.pi*gamma/(1.-gamma))+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2. + mu.gradient -= gamma*mu/self.variance + S.gradient -= (1./self.variance - 1./S) * gamma /2. + if self.learnPi: + if len(self.pi)==1: + self.pi.gradient = (gamma/self.pi - (1.-gamma)/(1.-self.pi)).sum() + elif len(self.pi.shape)==1: + self.pi.gradient = (gamma/self.pi - (1.-gamma)/(1.-self.pi)).sum(axis=0) + else: + self.pi.gradient = (gamma/self.pi - (1.-gamma)/(1.-self.pi)) class VariationalPosterior(Parameterized): def __init__(self, means=None, variances=None, name='latent space', *a, **kw): @@ -155,6 +157,9 @@ class SpikeAndSlabPosterior(VariationalPosterior): n.parameters[dc['mean']._parent_index_] = dc['mean'] n.parameters[dc['variance']._parent_index_] = dc['variance'] n.parameters[dc['binary_prob']._parent_index_] = dc['binary_prob'] + n._gradient_array_ = None + oversize = self.size - self.mean.size - self.variance.size + n.size = n.mean.size + n.variance.size + oversize n.ndim = n.mean.ndim n.shape = n.mean.shape n.num_data = n.mean.shape[0] @@ -163,7 +168,7 @@ class SpikeAndSlabPosterior(VariationalPosterior): else: return super(VariationalPrior, self).__getitem__(s) - def plot(self, *args): + def plot(self, *args, **kwargs): """ Plot latent space X in 1D: @@ -172,4 +177,4 @@ class SpikeAndSlabPosterior(VariationalPosterior): import sys assert "matplotlib" in sys.modules, "matplotlib package has not been imported." 
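A note on the reworked SpikeAndSlabPrior.KL_divergence above: it reads as the standard spike-and-slab expression, a Bernoulli KL between gamma and pi plus a gamma-weighted Gaussian KL against a zero-mean slab with variance self.variance. A minimal numpy sketch of the same quantity (function and argument names are hypothetical):

    import numpy as np

    def spike_and_slab_kl(mu, S, gamma, pi, prior_var):
        # Bernoulli part: KL(gamma || pi), summed over all latent dimensions
        kl_bern = (gamma * np.log(gamma / pi)
                   + (1. - gamma) * np.log((1. - gamma) / (1. - pi))).sum()
        # Gaussian part, weighted by the "on" probability gamma:
        # KL(N(mu, S) || N(0, prior_var)) = 0.5*(log(prior_var/S) + (S + mu**2)/prior_var - 1)
        kl_gauss = 0.5 * (gamma * (np.log(prior_var) - np.log(S)
                                   + (S + np.square(mu)) / prior_var - 1.)).sum()
        return kl_bern + kl_gauss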
from ...plotting.matplot_dep import variational_plots - return variational_plots.plot_SpikeSlab(self,*args) + return variational_plots.plot_SpikeSlab(self,*args, **kwargs) diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index 803aa29f..cdf39a9b 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -118,5 +118,3 @@ class SparseGP(GP): psi2 = kern.psi2(self.Z, Xnew) var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1) return mu, var - - diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index 1932691c..842d0bf8 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -183,6 +183,33 @@ def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, plt.close(fig) return m +def ssgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, max_iters=1000, **k): + import GPy + from matplotlib import pyplot as plt + from ..util.misc import param_to_array + import numpy as np + + _np.random.seed(0) + data = GPy.util.datasets.oil() + + kernel = GPy.kern.RBF(Q, 1., 1./_np.random.uniform(0,1,(Q,)), ARD=True)# + GPy.kern.Bias(Q, _np.exp(-2)) + Y = data['X'][:N] + m = GPy.models.SSGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing, **k) + m.data_labels = data['Y'][:N].argmax(axis=1) + + if optimize: + m.optimize('bfgs', messages=verbose, max_iters=max_iters, gtol=.05) + + if plot: + fig, (latent_axes, sense_axes) = plt.subplots(1, 2) + m.plot_latent(ax=latent_axes, labels=m.data_labels) + data_show = GPy.plotting.matplot_dep.visualize.vector_show((m.Y[0,:])) + lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(param_to_array(m.X.mean)[0:1,:], # @UnusedVariable + m, data_show, latent_axes=latent_axes, sense_axes=sense_axes, labels=m.data_labels) + raw_input('Press enter to finish') + plt.close(fig) + return m + def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False): _np.random.seed(1234) @@ -288,6 +315,31 @@ def bgplvm_simulation(optimize=True, verbose=1, m.kern.plot_ARD('BGPLVM Simulation ARD Parameters') return m +def ssgplvm_simulation(optimize=True, verbose=1, + plot=True, plot_sim=False, + max_iters=2e4, useGPU=False + ): + from GPy import kern + from GPy.models import SSGPLVM + + D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 45, 3, 9 + _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim) + Y = Ylist[0] + k = kern.Linear(Q, ARD=True, useGPU=useGPU)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q) + #k = kern.RBF(Q, ARD=True, lengthscale=10.) 
+ m = SSGPLVM(Y, Q, init="pca", num_inducing=num_inducing, kernel=k) + m.X.variance[:] = _np.random.uniform(0,.01,m.X.shape) + m.likelihood.variance = .1 + + if optimize: + print "Optimizing model:" + m.optimize('scg', messages=verbose, max_iters=max_iters, + gtol=.05) + if plot: + m.X.plot("SSGPLVM Latent Space 1D") + m.kern.plot_ARD('SSGPLVM Simulation ARD Parameters') + return m + def bgplvm_simulation_missing_data(optimize=True, verbose=1, plot=True, plot_sim=False, max_iters=2e4, diff --git a/GPy/inference/latent_function_inference/posterior.py b/GPy/inference/latent_function_inference/posterior.py index 70b0e0ea..66c68261 100644 --- a/GPy/inference/latent_function_inference/posterior.py +++ b/GPy/inference/latent_function_inference/posterior.py @@ -153,9 +153,14 @@ class Posterior(object): $$ """ if self._woodbury_inv is None: - self._woodbury_inv, _ = dpotri(self.woodbury_chol, lower=1) - #self._woodbury_inv, _ = dpotrs(self.woodbury_chol, np.eye(self.woodbury_chol.shape[0]), lower=1) - symmetrify(self._woodbury_inv) + if self._woodbury_chol is not None: + self._woodbury_inv, _ = dpotri(self._woodbury_chol, lower=1) + #self._woodbury_inv, _ = dpotrs(self.woodbury_chol, np.eye(self.woodbury_chol.shape[0]), lower=1) + symmetrify(self._woodbury_inv) + elif self._covariance is not None: + B = self._K - self._covariance + tmp, _ = dpotrs(self.K_chol, B) + self._woodbury_inv, _ = dpotrs(self.K_chol, tmp.T) return self._woodbury_inv @property diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py index b3a09f8f..4f21bc29 100644 --- a/GPy/inference/latent_function_inference/var_dtc.py +++ b/GPy/inference/latent_function_inference/var_dtc.py @@ -64,20 +64,9 @@ class VarDTC(LatentFunctionInference): return Y * prec # TODO chache this, and make it effective def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None): - if isinstance(X, VariationalPosterior): - uncertain_inputs = True - psi0 = kern.psi0(Z, X) - psi1 = kern.psi1(Z, X) - psi2 = kern.psi2(Z, X) - else: - uncertain_inputs = False - psi0 = kern.Kdiag(X) - psi1 = kern.K(X, Z) - psi2 = None - - #see whether we're using variational uncertain inputs _, output_dim = Y.shape + uncertain_inputs = isinstance(X, VariationalPosterior) #see whether we've got a different noise variance for each datum beta = 1./np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6) @@ -98,23 +87,21 @@ class VarDTC(LatentFunctionInference): diag.add(Kmm, self.const_jitter) Lm = jitchol(Kmm) - # The rather complex computations of A + + # The rather complex computations of A, and the psi stats if uncertain_inputs: + psi0 = kern.psi0(Z, X) + psi1 = kern.psi1(Z, X) if het_noise: - psi2_beta = psi2 * (beta.flatten().reshape(num_data, 1, 1)).sum(0) + psi2_beta = np.sum([kern.psi2(Z,X[i:i+1,:]) * beta_i for i,beta_i in enumerate(beta)],0) else: - psi2_beta = psi2.sum(0) * beta - #if 0: - # evals, evecs = linalg.eigh(psi2_beta) - # clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable - # if not np.array_equal(evals, clipped_evals): - # pass # print evals - # tmp = evecs * np.sqrt(clipped_evals) - # tmp = tmp.T - # no backsubstitution because of bound explosion on tr(A) if not... 
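Regarding the new fallback in Posterior.woodbury_inv above: when only the posterior covariance is stored, the woodbury inverse is recovered as K^{-1}(K - Sigma)K^{-1} via two solves against the Cholesky factor of K. A small sketch of that identity, using scipy in place of GPy's dpotrs wrapper (names hypothetical):

    import numpy as np
    from scipy.linalg import cho_solve

    def woodbury_inv_from_cov(K, Sigma, K_chol_lower):
        B = K - Sigma
        tmp = cho_solve((K_chol_lower, True), B)        # K^{-1} (K - Sigma)
        return cho_solve((K_chol_lower, True), tmp.T)   # K^{-1} (K - Sigma) K^{-1}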
+ psi2_beta = kern.psi2(Z,X) * beta LmInv = dtrtri(Lm) A = LmInv.dot(psi2_beta.dot(LmInv.T)) else: + psi0 = kern.Kdiag(X) + psi1 = kern.K(X, Z) + psi2 = None if het_noise: tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1))) else: @@ -151,7 +138,7 @@ class VarDTC(LatentFunctionInference): log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise, psi0, A, LB, trYYT, data_fit, VVT_factor) - #put the gradients in the right places + #noise derivatives dL_dR = _compute_dL_dR(likelihood, het_noise, uncertain_inputs, LB, _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A, @@ -160,6 +147,7 @@ class VarDTC(LatentFunctionInference): dL_dthetaL = likelihood.exact_inference_gradients(dL_dR,Y_metadata) + #put the gradients in the right places if uncertain_inputs: grad_dict = {'dL_dKmm': dL_dKmm, 'dL_dpsi0':dL_dpsi0, @@ -403,10 +391,7 @@ def _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, VVT_factor, C dL_dpsi2 = None else: dL_dpsi2 = beta * dL_dpsi2_beta - if uncertain_inputs: - # repeat for each of the N psi_2 matrices - dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], num_data, axis=0) - else: + if not uncertain_inputs: # subsume back into psi1 (==Kmn) dL_dpsi1 += 2.*np.dot(psi1, dL_dpsi2) dL_dpsi2 = None diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py index d346d01f..3bd5c347 100644 --- a/GPy/inference/latent_function_inference/var_dtc_gpu.py +++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py @@ -16,7 +16,7 @@ try: import scikits.cuda.linalg as culinalg import pycuda.gpuarray as gpuarray from scikits.cuda import cublas - from ...util.linalg_gpu import logDiagSum, strideSum, mul_bcast, sum_axis, outer_prod, mul_bcast_first, join_prod + from ...util.linalg_gpu import logDiagSum, strideSum, mul_bcast, sum_axis, outer_prod, mul_bcast_first, join_prod, traceDot except: pass @@ -66,18 +66,13 @@ class VarDTC_GPU(LatentFunctionInference): 'beta_gpu' :gpuarray.empty((ndata,),np.float64,order='F'), 'YT_gpu' :gpuarray.to_gpu(np.asfortranarray(Y.T)), # DxN 'betaYT_gpu' :gpuarray.empty(Y.T.shape,np.float64,order='F'), # DxN - 'psi2_t_gpu' :gpuarray.empty((num_inducing*num_inducing*self.batchsize),np.float64,order='F'), # inference_minibatch 'dL_dpsi0_gpu' :gpuarray.empty((self.batchsize,),np.float64,order='F'), 'dL_dpsi1_gpu' :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'), - 'dL_dpsi2_gpu' :gpuarray.empty((self.batchsize,num_inducing,num_inducing),np.float64,order='F'), - 'dL_dthetaL_gpu' :gpuarray.empty((self.batchsize,),np.float64,order='F'), - 'betapsi1_gpu' :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'), - 'thetaL_t_gpu' :gpuarray.empty((self.batchsize,),np.float64,order='F'), - 'betaYT2_gpu' :gpuarray.empty((output_dim,self.batchsize),np.float64,order='F'), + 'dL_dpsi2_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), 'psi0p_gpu' :gpuarray.empty((self.batchsize,),np.float64,order='F'), 'psi1p_gpu' :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'), - 'psi2p_gpu' :gpuarray.empty((self.batchsize,num_inducing,num_inducing),np.float64,order='F'), + 'psi2p_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), } self.gpuCache['ones_gpu'].fill(1.0) @@ -126,6 +121,89 @@ class VarDTC_GPU(LatentFunctionInference): else: return jitchol(tdot(Y)) + def gatherPsiStat(self, kern, X, Z, Y, beta, uncertain_inputs, het_noise): + num_inducing, input_dim = Z.shape[0], Z.shape[1] + num_data, output_dim = 
Y.shape + trYYT = self._trYYT + psi1Y_gpu = self.gpuCache['psi1Y_gpu'] + psi2_gpu = self.gpuCache['psi2_gpu'] + beta_gpu = self.gpuCache['beta_gpu'] + YT_gpu = self.gpuCache['YT_gpu'] + betaYT_gpu = self.gpuCache['betaYT_gpu'] + + beta_gpu.fill(beta) + betaYT_gpu.fill(0.) + cublas.cublasDaxpy(self.cublas_handle, betaYT_gpu.size, beta, YT_gpu.gpudata, 1, betaYT_gpu.gpudata, 1) + YRY_full = trYYT*beta + + if kern.useGPU: + psi1Y_gpu.fill(0.) + psi2_gpu.fill(0.) + psi0_full = 0 + + for n_start in xrange(0,num_data,self.batchsize): + n_end = min(self.batchsize+n_start, num_data) + ndata = n_end - n_start + X_slice = X[n_start:n_end] + betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end] + + if uncertain_inputs: + psi0 = kern.psi0(Z, X_slice) + psi1p_gpu = kern.psi1(Z, X_slice) + psi2p_gpu = kern.psi2(Z, X_slice) + else: + psi0 = kern.Kdiag(X_slice) + psi1p_gpu = kern.K(X_slice, Z) + + cublas.cublasDgemm(self.cublas_handle, 'T', 'T', num_inducing, output_dim, ndata, 1.0, psi1p_gpu.gpudata, ndata, betaYT_gpu_slice.gpudata, output_dim, 1.0, psi1Y_gpu.gpudata, num_inducing) + + psi0_full += psi0.sum() + + if uncertain_inputs: + sum_axis(psi2_gpu,psi2p_gpu,1,1) + else: + cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, num_inducing, ndata, beta, psi1p_gpu.gpudata, ndata, psi1p_gpu.gpudata, ndata, 1.0, psi2_gpu.gpudata, num_inducing) + + psi0_full *= beta + if uncertain_inputs: + cublas.cublasDscal(self.cublas_handle, psi2_gpu.size, beta, psi2_gpu.gpudata, 1) + + else: + psi2_full = np.zeros((num_inducing,num_inducing)) + psi1Y_full = np.zeros((output_dim,num_inducing)) # DxM + psi0_full = 0. + YRY_full = 0. + + for n_start in xrange(0,num_data,self.batchsize): + n_end = min(self.batchsize+n_start, num_data) + Y_slice = Y[n_start:n_end] + X_slice = X[n_start:n_end] + + if het_noise: + b = beta[n_start] + YRY_full += np.inner(Y_slice, Y_slice)*b + else: + b = beta + + if uncertain_inputs: + psi0 = kern.psi0(Z, X_slice) + psi1 = kern.psi1(Z, X_slice) + psi2_full += kern.psi2(Z, X_slice)*b + else: + psi0 = kern.Kdiag(X_slice) + psi1 = kern.K(X_slice, Z) + psi2_full += np.dot(psi1.T,psi1)*b + + psi0_full += psi0.sum()*b + psi1Y_full += np.dot(Y_slice.T,psi1)*b # DxM + + if not het_noise: + YRY_full = trYYT*beta + psi1Y_gpu.set(psi1Y_full) + psi2_gpu.set(psi2_full) + + return psi0_full, YRY_full + def inference_likelihood(self, kern, X, Z, likelihood, Y): """ The first phase of inference: @@ -137,6 +215,12 @@ class VarDTC_GPU(LatentFunctionInference): num_inducing, input_dim = Z.shape[0], Z.shape[1] num_data, output_dim = Y.shape + #see whether we've got a different noise variance for each datum + beta = 1./np.fmax(likelihood.variance, 1e-6) + het_noise = beta.size > 1 + if het_noise: + self.batchsize=0 + self._initGPUCache(kern, num_inducing, input_dim, output_dim, Y) if isinstance(X, VariationalPosterior): @@ -144,123 +228,10 @@ class VarDTC_GPU(LatentFunctionInference): else: uncertain_inputs = False - #see whether we've got a different noise variance for each datum - beta = 1./np.fmax(likelihood.variance, 1e-6) - het_noise = beta.size > 1 - trYYT = self._trYYT - psi1Y_gpu = self.gpuCache['psi1Y_gpu'] psi2_gpu = self.gpuCache['psi2_gpu'] - beta_gpu = self.gpuCache['beta_gpu'] - YT_gpu = self.gpuCache['YT_gpu'] - betaYT_gpu = self.gpuCache['betaYT_gpu'] - psi2_t_gpu = self.gpuCache['psi2_t_gpu'] - if het_noise: - beta_gpu.set(np.asfortranarray(beta)) - mul_bcast(betaYT_gpu,beta_gpu,YT_gpu,beta_gpu.size) - YRY_full = cublas.cublasDdot(self.cublas_handle, YT_gpu.size, betaYT_gpu.gpudata, 1, 
YT_gpu.gpudata, 1) - else: - beta_gpu.fill(beta) - betaYT_gpu.fill(0.) - cublas.cublasDaxpy(self.cublas_handle, betaYT_gpu.size, beta, YT_gpu.gpudata, 1, betaYT_gpu.gpudata, 1) - YRY_full = trYYT*beta - - if kern.useGPU: - psi1Y_gpu.fill(0.) - psi2_gpu.fill(0.) - psi0_full = 0 - - for n_start in xrange(0,num_data,self.batchsize): - n_end = min(self.batchsize+n_start, num_data) - ndata = n_end - n_start - X_slice = X[n_start:n_end] - beta_gpu_slice = beta_gpu[n_start:n_end] - betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end] - if ndata==self.batchsize: - psi2_t_gpu_slice = psi2_t_gpu - else: - psi2_t_gpu_slice = psi2_t_gpu[:num_inducing*num_inducing*ndata] - if uncertain_inputs: - psi0p_gpu = kern.psi0(Z, X_slice) - psi1p_gpu = kern.psi1(Z, X_slice) - psi2p_gpu = kern.psi2(Z, X_slice) - else: - psi0p_gpu = kern.Kdiag(X_slice) - psi1p_gpu = kern.K(X_slice, Z) - - cublas.cublasDgemm(self.cublas_handle, 'T', 'T', num_inducing, output_dim, ndata, 1.0, psi1p_gpu.gpudata, ndata, betaYT_gpu_slice.gpudata, output_dim, 1.0, psi1Y_gpu.gpudata, num_inducing) - - if het_noise: - psi0_full += cublas.cublasDdot(self.cublas_handle, psi0p_gpu.size, beta_gpu_slice.gpudata, 1, psi0p_gpu.gpudata, 1) - else: - psi0_full += gpuarray.sum(psi0p_gpu).get() - - if uncertain_inputs: - if het_noise: - mul_bcast(psi2_t_gpu_slice,beta_gpu_slice,psi2p_gpu,beta_gpu_slice.size) - sum_axis(psi2_gpu,psi2_t_gpu_slice,1,ndata) - else: - sum_axis(psi2_gpu,psi2p_gpu,1,ndata) - else: - if het_noise: - psi1_t_gpu = psi2_t_gpu_slice[:,num_inducing*ndata] - mul_bcast(psi1_t_gpu,beta_gpu_slice,psi1p_gpu,beta_gpu_slice.size) - cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, num_inducing, ndata, 1.0, psi1p_gpu.gpudata, ndata, psi1_t_gpu.gpudata, ndata, 1.0, psi2_gpu.gpudata, num_inducing) - else: - cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, num_inducing, ndata, beta, psi1p_gpu.gpudata, ndata, psi1p_gpu.gpudata, ndata, 1.0, psi2_gpu.gpudata, num_inducing) - - if not het_noise: - psi0_full *= beta - if uncertain_inputs: - cublas.cublasDscal(self.cublas_handle, psi2_gpu.size, beta, psi2_gpu.gpudata, 1) - - else: - psi2_full = np.zeros((num_inducing,num_inducing),order='F') - psi1Y_full = np.zeros((num_inducing,output_dim),order='F') # MxD - psi0_full = 0 -# YRY_full = 0 - - for n_start in xrange(0,num_data,self.batchsize): - n_end = min(self.batchsize+n_start, num_data) - Y_slice = Y[n_start:n_end] - X_slice = X[n_start:n_end] - if uncertain_inputs: - psi0 = kern.psi0(Z, X_slice) - psi1 = kern.psi1(Z, X_slice) - psi2 = kern.psi2(Z, X_slice) - else: - psi0 = kern.Kdiag(X_slice) - psi1 = kern.K(X_slice, Z) - - if het_noise: - beta_slice = beta[n_start:n_end] - psi0_full += (beta_slice*psi0).sum() - psi1Y_full += np.dot(psi1.T,beta_slice[:,None]*Y_slice) # MxD -# YRY_full += (beta_slice*np.square(Y_slice).sum(axis=-1)).sum() - else: - psi0_full += psi0.sum() - psi1Y_full += np.dot(psi1.T,Y_slice) # MxD - - if uncertain_inputs: - if het_noise: - psi2_full += np.einsum('n,nmo->mo',beta_slice,psi2) - else: - psi2_full += psi2.sum(axis=0) - else: - if het_noise: - psi2_full += np.einsum('n,nm,no->mo',beta_slice,psi1,psi1) - else: - psi2_full += tdot(psi1.T) - - if not het_noise: - psi0_full *= beta - psi1Y_full *= beta - psi2_full *= beta -# YRY_full = trYYT*beta - - psi1Y_gpu.set(psi1Y_full) - psi2_gpu.set(psi2_full) + psi0_full, YRY_full = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs, het_noise) #====================================================================== # Compute Common Components @@ 
-373,6 +344,16 @@ class VarDTC_GPU(LatentFunctionInference): post = Posterior(woodbury_inv=KmmInvPsi2P_gpu.get(), woodbury_vector=v_gpu.get(), K=Kmm_gpu.get(), mean=None, cov=None, K_chol=Lm_gpu.get()) + #====================================================================== + # Compute dL_dthetaL for uncertian input and non-heter noise + #====================================================================== + + if not het_noise: + dL_dthetaL = (YRY_full + output_dim*psi0_full - num_data*output_dim)/-2. + dL_dthetaL += cublas.cublasDdot(self.cublas_handle,dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata,1,psi2_gpu.gpudata,1) + dL_dthetaL += cublas.cublasDdot(self.cublas_handle,v_gpu.size, v_gpu.gpudata,1,psi1Y_gpu.gpudata,1) + self.midRes['dL_dthetaL'] = -beta*dL_dthetaL + return logL, dL_dKmm_gpu.get(), post def inference_minibatch(self, kern, X, Z, likelihood, Y): @@ -404,26 +385,26 @@ class VarDTC_GPU(LatentFunctionInference): nSlice = n_end-n_start X_slice = X[n_start:n_end] + if het_noise: + beta = beta[n_start] # nSlice==1 if kern.useGPU: - if uncertain_inputs: - psi0p_gpu = kern.psi0(Z, X_slice) - psi1p_gpu = kern.psi1(Z, X_slice) - psi2p_gpu = kern.psi2(Z, X_slice) - else: + if not uncertain_inputs: psi0p_gpu = kern.Kdiag(X_slice) psi1p_gpu = kern.K(X_slice, Z) psi2p_gpu = self.gpuCache['psi2p_gpu'] - if psi2p_gpu.shape[0] > nSlice: - psi2p_gpu = psi2p_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing) - else: - if uncertain_inputs: + elif het_noise: + psi0p_gpu = kern.psi0(Z, X_slice) + psi1p_gpu = kern.psi1(Z, X_slice) + psi2p_gpu = kern.psi2(Z, X_slice) + elif not uncertain_inputs or het_noise: + if not uncertain_inputs: + psi0 = kern.Kdiag(X_slice) + psi1 = kern.K(X_slice, Z) + elif het_noise: psi0 = kern.psi0(Z, X_slice) psi1 = kern.psi1(Z, X_slice) psi2 = kern.psi2(Z, X_slice) - else: - psi0 = kern.Kdiag(X_slice) - psi1 = kern.K(X_slice, Z) psi0p_gpu = self.gpuCache['psi0p_gpu'] psi1p_gpu = self.gpuCache['psi1p_gpu'] @@ -431,91 +412,46 @@ class VarDTC_GPU(LatentFunctionInference): if psi0p_gpu.shape[0] > nSlice: psi0p_gpu = psi0p_gpu[:nSlice] psi1p_gpu = psi1p_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing) - psi2p_gpu = psi2p_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing) psi0p_gpu.set(np.asfortranarray(psi0)) psi1p_gpu.set(np.asfortranarray(psi1)) if uncertain_inputs: psi2p_gpu.set(np.asfortranarray(psi2)) - - #====================================================================== - # Prepare gpu memory - #====================================================================== - - dL_dpsi2R_gpu = self.gpuCache['dL_dpsi2R_gpu'] - v_gpu = self.gpuCache['v_gpu'] - betaYT_gpu = self.gpuCache['betaYT_gpu'] - beta_gpu = self.gpuCache['beta_gpu'] - dL_dpsi0_gpu = self.gpuCache['dL_dpsi0_gpu'] - dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu'] - dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu'] - dL_dthetaL_gpu = self.gpuCache['dL_dthetaL_gpu'] - psi2R_gpu = self.gpuCache['psi2_t_gpu'][:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing) - betapsi1_gpu = self.gpuCache['betapsi1_gpu'] - thetaL_t_gpu = self.gpuCache['thetaL_t_gpu'] - betaYT2_gpu = self.gpuCache['betaYT2_gpu'] - - betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end] - beta_gpu_slice = beta_gpu[n_start:n_end] - - # Adjust to the batch size - if dL_dpsi0_gpu.shape[0] > nSlice: - betaYT2_gpu = betaYT2_gpu[:,:nSlice] - dL_dpsi0_gpu = dL_dpsi0_gpu.ravel()[:nSlice] - dL_dpsi1_gpu = 
dL_dpsi1_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing) - dL_dpsi2_gpu = dL_dpsi2_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing) - dL_dthetaL_gpu = dL_dthetaL_gpu.ravel()[:nSlice] - psi2R_gpu = psi2R_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing) - thetaL_t_gpu = thetaL_t_gpu.ravel()[:nSlice] - betapsi1_gpu = betapsi1_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing) - - mul_bcast(betapsi1_gpu,beta_gpu_slice,psi1p_gpu,beta_gpu_slice.size) - + #====================================================================== # Compute dL_dpsi #====================================================================== + + dL_dpsi2R_gpu = self.gpuCache['dL_dpsi2R_gpu'] + v_gpu = self.gpuCache['v_gpu'] + dL_dpsi0_gpu = self.gpuCache['dL_dpsi0_gpu'] + dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu'] + dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu'] + betaYT_gpu = self.gpuCache['betaYT_gpu'] + betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end] - dL_dpsi0_gpu.fill(0.) - cublas.cublasDaxpy(self.cublas_handle, dL_dpsi0_gpu.size, output_dim/(-2.), beta_gpu_slice.gpudata, 1, dL_dpsi0_gpu.gpudata, 1) + # Adjust to the batch size + if dL_dpsi0_gpu.shape[0] > nSlice: + dL_dpsi0_gpu = dL_dpsi0_gpu.ravel()[:nSlice] + dL_dpsi1_gpu = dL_dpsi1_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing) + + dL_dpsi0_gpu.fill(-output_dim *beta/2.) cublas.cublasDgemm(self.cublas_handle, 'T', 'T', nSlice, num_inducing, output_dim, 1.0, betaYT_gpu_slice.gpudata, output_dim, v_gpu.gpudata, num_inducing, 0., dL_dpsi1_gpu.gpudata, nSlice) if uncertain_inputs: - outer_prod(dL_dpsi2_gpu,beta_gpu_slice,dL_dpsi2R_gpu,beta_gpu_slice.size) + cublas.cublasDcopy(self.cublas_handle, dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata, 1, dL_dpsi2_gpu.gpudata, 1) + cublas.cublasDscal(self.cublas_handle, dL_dpsi2_gpu.size, beta, dL_dpsi2_gpu.gpudata, 1) else: - cublas.cublasDgemm(self.cublas_handle, 'N', 'N', nSlice, num_inducing, output_dim, 1.0, betapsi1_gpu.gpudata, nSlice, dL_dpsi2R_gpu.gpudata, num_inducing, 1.0, dL_dpsi1_gpu.gpudata, nSlice) - + cublas.cublasDgemm(self.cublas_handle, 'N', 'N', nSlice, num_inducing, output_dim, beta, psi1p_gpu.gpudata, nSlice, dL_dpsi2R_gpu.gpudata, num_inducing, 1.0, dL_dpsi1_gpu.gpudata, nSlice) + #====================================================================== # Compute dL_dthetaL #====================================================================== - - if not uncertain_inputs: - join_prod(psi2p_gpu,psi1p_gpu,psi1p_gpu,nSlice,num_inducing) - - mul_bcast_first(psi2R_gpu,dL_dpsi2R_gpu,psi2p_gpu,nSlice) - - - dL_dthetaL_gpu.fill(0.) - - cublas.cublasDcopy(self.cublas_handle, betaYT_gpu_slice.size, betaYT_gpu_slice.gpudata, 1, betaYT2_gpu.gpudata, 1) - mul_bcast(betaYT2_gpu,betaYT2_gpu,betaYT2_gpu,betaYT2_gpu.size) - cublas.cublasDscal(self.cublas_handle, betaYT2_gpu.size, 0.5, betaYT2_gpu.gpudata, 1) - sum_axis(dL_dthetaL_gpu, betaYT2_gpu, 1, output_dim) - - cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, output_dim/(-2.0), beta_gpu_slice.gpudata, 1, dL_dthetaL_gpu.gpudata, 1) - cublas.cublasDcopy(self.cublas_handle, beta_gpu_slice.size, beta_gpu_slice.gpudata, 1, thetaL_t_gpu.gpudata, 1) - mul_bcast(thetaL_t_gpu,thetaL_t_gpu,thetaL_t_gpu,thetaL_t_gpu.size) - mul_bcast(thetaL_t_gpu,thetaL_t_gpu,psi0p_gpu,thetaL_t_gpu.size) - cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, output_dim/2.0, thetaL_t_gpu.gpudata, 1, dL_dthetaL_gpu.gpudata, 1) - - thetaL_t_gpu.fill(0.) 
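A note on the homoscedastic noise gradient introduced above: dL_dthetaL is now assembled once per full pass in inference_likelihood (and cached in midRes) rather than accumulated per minibatch. A plain-numpy transcription of the cached quantity, assuming the names match the cached arrays (beta is the scalar noise precision, already folded into the psi statistics):

    import numpy as np

    def dL_dthetaL_homoscedastic(beta, YRY_full, psi0_full, psi1Y_full, psi2_full,
                                 v, dL_dpsi2R, num_data, output_dim):
        acc = (YRY_full + output_dim * psi0_full - num_data * output_dim) / -2.
        acc += (dL_dpsi2R * psi2_full).sum()   # the cublasDdot over the MxM arrays
        acc += (v * psi1Y_full.T).sum()        # the cublasDdot of v (MxD) with psi1Y (DxM)
        return -beta * acc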
- sum_axis(thetaL_t_gpu, psi2R_gpu, nSlice, num_inducing*num_inducing) - mul_bcast(thetaL_t_gpu,thetaL_t_gpu,beta_gpu_slice,thetaL_t_gpu.size) - mul_bcast(thetaL_t_gpu,thetaL_t_gpu,beta_gpu_slice,thetaL_t_gpu.size) - cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, -1.0, thetaL_t_gpu.gpudata, 1, dL_dthetaL_gpu.gpudata, 1) - - cublas.cublasDgemm(self.cublas_handle, 'T', 'T', output_dim, nSlice, num_inducing, -1.0, v_gpu.gpudata, num_inducing, betapsi1_gpu.gpudata, nSlice, 0.0, betaYT2_gpu.gpudata, output_dim) - mul_bcast(betaYT2_gpu,betaYT2_gpu,betaYT_gpu_slice,betaYT2_gpu.size) - sum_axis(dL_dthetaL_gpu, betaYT2_gpu, 1, output_dim) + if het_noise: + betaY = betaYT_gpu_slice.get() + dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0p_gpu.get())-output_dim*beta)/2. + dL_dthetaL += -beta*beta*cublas.cublasDdot(self.cublas_handle,dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata,1,psi2p_gpu.gpudata,1) + dL_dthetaL += -beta*(betaY*np.dot(psi1p_gpu.get(),v_gpu.get())).sum(axis=-1) if kern.useGPU: dL_dpsi0 = dL_dpsi0_gpu @@ -528,10 +464,11 @@ class VarDTC_GPU(LatentFunctionInference): dL_dpsi2 = dL_dpsi2_gpu else: dL_dpsi2 = dL_dpsi2_gpu.get() - if het_noise: - dL_dthetaL = dL_dthetaL_gpu.get() - else: - dL_dthetaL = gpuarray.sum(dL_dthetaL_gpu).get() + if not het_noise: + if isEnd: + dL_dthetaL = self.midRes['dL_dthetaL'] + else: + dL_dthetaL = 0. if uncertain_inputs: grad_dict = {'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, @@ -541,6 +478,6 @@ class VarDTC_GPU(LatentFunctionInference): grad_dict = {'dL_dKdiag':dL_dpsi0, 'dL_dKnm':dL_dpsi1, 'dL_dthetaL':dL_dthetaL} - + return isEnd, (n_start,n_end), grad_dict diff --git a/GPy/inference/latent_function_inference/var_dtc_parallel.py b/GPy/inference/latent_function_inference/var_dtc_parallel.py index e9a40cbb..03f5d478 100644 --- a/GPy/inference/latent_function_inference/var_dtc_parallel.py +++ b/GPy/inference/latent_function_inference/var_dtc_parallel.py @@ -10,6 +10,11 @@ from ...util.misc import param_to_array from . import LatentFunctionInference log_2_pi = np.log(2*np.pi) +try: + from mpi4py import MPI +except: + pass + class VarDTC_minibatch(LatentFunctionInference): """ An object for inference when the likelihood is Gaussian, but we want to do sparse inference. @@ -21,22 +26,39 @@ class VarDTC_minibatch(LatentFunctionInference): """ const_jitter = 1e-6 - def __init__(self, batchsize, limit=1): - + def __init__(self, batchsize=None, limit=1, mpi_comm=None): + self.batchsize = batchsize - + self.mpi_comm = mpi_comm + self.limit = limit + # Cache functions from ...util.caching import Cacher self.get_trYYT = Cacher(self._get_trYYT, limit) self.get_YYTfactor = Cacher(self._get_YYTfactor, limit) - + self.midRes = {} self.batch_pos = 0 # the starting position of the current mini-batch + self.Y_speedup = False # Replace Y with the cholesky factor of YY.T, but the posterior inference will be wrong + + def __getstate__(self): + # has to be overridden, as Cacher objects cannot be pickled. + return self.batchsize, self.limit, self.Y_speedup + + def __setstate__(self, state): + # has to be overridden, as Cacher objects cannot be pickled. 
+ self.batchsize, self.limit, self.Y_speedup = state + self.mpi_comm = None + self.midRes = {} + self.batch_pos = 0 + from ...util.caching import Cacher + self.get_trYYT = Cacher(self._get_trYYT, self.limit) + self.get_YYTfactor = Cacher(self._get_YYTfactor, self.limit) def set_limit(self, limit): self.get_trYYT.limit = limit self.get_YYTfactor.limit = limit - + def _get_trYYT(self, Y): return param_to_array(np.sum(np.square(Y))) @@ -51,88 +73,103 @@ class VarDTC_minibatch(LatentFunctionInference): return param_to_array(Y) else: return jitchol(tdot(Y)) + + def gatherPsiStat(self, kern, X, Z, Y, beta, uncertain_inputs): + + het_noise = beta.size > 1 + trYYT = self.get_trYYT(Y) + if self.Y_speedup and not het_noise: + Y = self.get_YYTfactor(Y) + + num_inducing = Z.shape[0] + num_data, output_dim = Y.shape + if self.batchsize == None: + self.batchsize = num_data + + psi2_full = np.zeros((num_inducing,num_inducing)) + psi1Y_full = np.zeros((output_dim,num_inducing)) # DxM + psi0_full = 0. + YRY_full = 0. + + for n_start in xrange(0,num_data,self.batchsize): + n_end = min(self.batchsize+n_start, num_data) + if (n_end-n_start)==num_data: + Y_slice = Y + X_slice = X + else: + Y_slice = Y[n_start:n_end] + X_slice = X[n_start:n_end] + + if het_noise: + b = beta[n_start] + YRY_full += np.inner(Y_slice, Y_slice)*b + else: + b = beta + + if uncertain_inputs: + psi0 = kern.psi0(Z, X_slice) + psi1 = kern.psi1(Z, X_slice) + psi2_full += kern.psi2(Z, X_slice)*b + else: + psi0 = kern.Kdiag(X_slice) + psi1 = kern.K(X_slice, Z) + psi2_full += np.dot(psi1.T,psi1)*b + + psi0_full += psi0.sum()*b + psi1Y_full += np.dot(Y_slice.T,psi1)*b # DxM + + if not het_noise: + YRY_full = trYYT*beta + + if self.mpi_comm != None: + psi0_all = np.array(psi0_full) + psi1Y_all = psi1Y_full.copy() + psi2_all = psi2_full.copy() + YRY_all = np.array(YRY_full) + self.mpi_comm.Allreduce([psi0_full, MPI.DOUBLE], [psi0_all, MPI.DOUBLE]) + self.mpi_comm.Allreduce([psi1Y_full, MPI.DOUBLE], [psi1Y_all, MPI.DOUBLE]) + self.mpi_comm.Allreduce([psi2_full, MPI.DOUBLE], [psi2_all, MPI.DOUBLE]) + self.mpi_comm.Allreduce([YRY_full, MPI.DOUBLE], [YRY_all, MPI.DOUBLE]) + return psi0_all, psi1Y_all, psi2_all, YRY_all + + return psi0_full, psi1Y_full, psi2_full, YRY_full + def inference_likelihood(self, kern, X, Z, likelihood, Y): """ The first phase of inference: Compute: log-likelihood, dL_dKmm - + Cached intermediate results: Kmm, KmmInv, """ - - num_inducing = Z.shape[0] - num_data, output_dim = Y.shape + + num_data, output_dim = Y.shape + if self.mpi_comm != None: + num_data_all = np.array(num_data,dtype=np.int32) + self.mpi_comm.Allreduce([np.int32(num_data), MPI.INT], [num_data_all, MPI.INT]) + num_data = num_data_all if isinstance(X, VariationalPosterior): uncertain_inputs = True else: uncertain_inputs = False - + #see whether we've got a different noise variance for each datum beta = 1./np.fmax(likelihood.variance, 1e-6) het_noise = beta.size > 1 if het_noise: self.batchsize = 1 - # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency! 
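For the certain-inputs branch of gatherPsiStat above, the psi statistics reduce to ordinary kernel quantities, and the accumulated psi2 is simply Knm^T diag(beta) Knm. A quick numpy check of that equivalence with hypothetical arrays (homoscedastic beta, batchsize of one):

    import numpy as np

    rng = np.random.RandomState(0)
    N, M = 6, 4
    Knm, beta = rng.randn(N, M), 0.5
    psi2_full = np.zeros((M, M))
    for n in range(N):                       # the batched accumulation loop
        psi1 = Knm[n:n+1]
        psi2_full += np.dot(psi1.T, psi1) * beta
    assert np.allclose(psi2_full, beta * Knm.T.dot(Knm))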
- #self.YYTfactor = beta*self.get_YYTfactor(Y) - YYT_factor = Y - trYYT = self.get_trYYT(Y) - - psi2_full = np.zeros((num_inducing,num_inducing)) - psi1Y_full = np.zeros((output_dim,num_inducing)) # DxM - psi0_full = 0 - YRY_full = 0 - - for n_start in xrange(0,num_data,self.batchsize): - - n_end = min(self.batchsize+n_start, num_data) - - Y_slice = YYT_factor[n_start:n_end] - X_slice = X[n_start:n_end] - - if uncertain_inputs: - psi0 = kern.psi0(Z, X_slice) - psi1 = kern.psi1(Z, X_slice) - psi2 = kern.psi2(Z, X_slice) - else: - psi0 = kern.Kdiag(X_slice) - psi1 = kern.K(X_slice, Z) - psi2 = None - - if het_noise: - beta_slice = beta[n_start:n_end] - psi0_full += (beta_slice*psi0).sum() - psi1Y_full += np.dot(beta_slice*Y_slice.T,psi1) # DxM - YRY_full += (beta_slice*np.square(Y_slice).sum(axis=-1)).sum() - else: - psi0_full += psi0.sum() - psi1Y_full += np.dot(Y_slice.T,psi1) # DxM - - if uncertain_inputs: - if het_noise: - psi2_full += beta_slice*psi2 - else: - psi2_full += psi2.sum(0) - else: - if het_noise: - psi2_full += beta_slice*np.outer(psi1,psi1) - else: - psi2_full += np.einsum('nm,jk->mk',psi1,psi1) - - if not het_noise: - psi0_full *= beta - psi1Y_full *= beta - psi2_full *= beta - YRY_full = trYYT*beta + psi0_full, psi1Y_full, psi2_full, YRY_full = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs) + #====================================================================== # Compute Common Components #====================================================================== - self.psi1Y = psi1Y_full + Kmm = kern.K(Z).copy() diag.add(Kmm, self.const_jitter) Lm = jitchol(Kmm) - + Lambda = Kmm+psi2_full LL = jitchol(Lambda) b,_ = dtrtrs(LL, psi1Y_full.T) @@ -140,18 +177,18 @@ class VarDTC_minibatch(LatentFunctionInference): v,_ = dtrtrs(LL.T,b,lower=False) vvt = np.einsum('md,od->mo',v,v) LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right') - + Psi2LLInvT = dtrtrs(LL,psi2_full)[0].T LmInvPsi2LLInvT= dtrtrs(Lm,Psi2LLInvT)[0] KmmInvPsi2LLInvT = dtrtrs(Lm,LmInvPsi2LLInvT,trans=True)[0] KmmInvPsi2P = dtrtrs(LL,KmmInvPsi2LLInvT.T, trans=True)[0].T - + dL_dpsi2R = (output_dim*KmmInvPsi2P - vvt)/2. # dL_dpsi2 with R inside psi2 - + # Cache intermediate results self.midRes['dL_dpsi2R'] = dL_dpsi2R self.midRes['v'] = v - + #====================================================================== # Compute log-likelihood #====================================================================== @@ -159,33 +196,36 @@ class VarDTC_minibatch(LatentFunctionInference): logL_R = -np.log(beta).sum() else: logL_R = -num_data*np.log(beta) - logL = ( - -(output_dim*(num_data*log_2_pi+logL_R+psi0_full-np.trace(LmInvPsi2LmInvT))+YRY_full-bbt)/2. - -output_dim*(-np.log(np.diag(Lm)).sum()+np.log(np.diag(LL)).sum()) - ) + logL = -(output_dim*(num_data*log_2_pi+logL_R+psi0_full-np.trace(LmInvPsi2LmInvT))+YRY_full-bbt)/2.-output_dim*(-np.log(np.diag(Lm)).sum()+np.log(np.diag(LL)).sum()) #====================================================================== # Compute dL_dKmm #====================================================================== - + dL_dKmm = -(output_dim*np.einsum('md,od->mo',KmmInvPsi2LLInvT,KmmInvPsi2LLInvT) + vvt)/2. 
#====================================================================== # Compute the Posterior distribution of inducing points p(u|Y) #====================================================================== - -# phi_u_mean = np.dot(Kmm,v) -# LLInvKmm,_ = dtrtrs(LL,Kmm) -# # phi_u_var = np.einsum('ma,mb->ab',LLInvKmm,LLInvKmm) -# phi_u_var = Kmm - np.dot(LLInvKmm.T,LLInvKmm) - - post = Posterior(woodbury_inv=KmmInvPsi2P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=Lm) + + if not self.Y_speedup or het_noise: + post = Posterior(woodbury_inv=KmmInvPsi2P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=Lm) + else: + post = None + + #====================================================================== + # Compute dL_dthetaL for uncertian input and non-heter noise + #====================================================================== + + if not het_noise: + dL_dthetaL = (YRY_full*beta + beta*output_dim*psi0_full - num_data*output_dim*beta)/2. - beta*(dL_dpsi2R*psi2_full).sum() - beta*(v.T*psi1Y_full).sum() + self.midRes['dL_dthetaL'] = dL_dthetaL return logL, dL_dKmm, post def inference_minibatch(self, kern, X, Z, likelihood, Y): """ - The second phase of inference: Computing the derivatives over a minibatch of Y + The second phase of inference: Computing the derivatives over a minibatch of Y Compute: dL_dpsi0, dL_dpsi1, dL_dpsi2, dL_dthetaL return a flag showing whether it reached the end of Y (isEnd) """ @@ -196,14 +236,17 @@ class VarDTC_minibatch(LatentFunctionInference): uncertain_inputs = True else: uncertain_inputs = False - + #see whether we've got a different noise variance for each datum beta = 1./np.fmax(likelihood.variance, 1e-6) het_noise = beta.size > 1 # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency! #self.YYTfactor = beta*self.get_YYTfactor(Y) - YYT_factor = Y - + if self.Y_speedup and not het_noise: + YYT_factor = self.get_YYTfactor(Y) + else: + YYT_factor = Y + n_start = self.batch_pos n_end = min(self.batchsize+n_start, num_data) if n_end==num_data: @@ -212,66 +255,64 @@ class VarDTC_minibatch(LatentFunctionInference): else: isEnd = False self.batch_pos = n_end - - num_slice = n_end-n_start + Y_slice = YYT_factor[n_start:n_end] X_slice = X[n_start:n_end] - - if uncertain_inputs: - psi0 = kern.psi0(Z, X_slice) - psi1 = kern.psi1(Z, X_slice) - psi2 = kern.psi2(Z, X_slice) - else: + + if not uncertain_inputs: psi0 = kern.Kdiag(X_slice) psi1 = kern.K(X_slice, Z) psi2 = None - + betapsi1 = np.einsum('n,nm->nm',beta,psi1) + elif het_noise: + psi0 = kern.psi0(Z, X_slice) + psi1 = kern.psi1(Z, X_slice) + psi2 = kern.psi2(Z, X_slice) + betapsi1 = np.einsum('n,nm->nm',beta,psi1) + if het_noise: beta = beta[n_start] # assuming batchsize==1 betaY = beta*Y_slice - betapsi1 = np.einsum('n,nm->nm',beta,psi1) - + #====================================================================== # Load Intermediate Results #====================================================================== - + dL_dpsi2R = self.midRes['dL_dpsi2R'] v = self.midRes['v'] - + #====================================================================== # Compute dL_dpsi #====================================================================== - - dL_dpsi0 = -0.5 * output_dim * (beta * np.ones((n_end-n_start,))) - + + dL_dpsi0 = -output_dim * (beta * np.ones((n_end-n_start,)))/2. + dL_dpsi1 = np.dot(betaY,v.T) - + if uncertain_inputs: dL_dpsi2 = beta* dL_dpsi2R else: dL_dpsi1 += np.dot(betapsi1,dL_dpsi2R)*2. 
dL_dpsi2 = None - + #====================================================================== # Compute dL_dthetaL #====================================================================== if het_noise: if uncertain_inputs: - psiR = np.einsum('mo,nmo->',dL_dpsi2R,psi2) + psiR = np.einsum('mo,mo->',dL_dpsi2R,psi2) else: psiR = np.einsum('nm,no,mo->',psi1,psi1,dL_dpsi2R) - + dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0)-output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum(axis=-1) else: - if uncertain_inputs: - psiR = np.einsum('mo,nmo->',dL_dpsi2R,psi2) + if isEnd: + dL_dthetaL = self.midRes['dL_dthetaL'] else: - psiR = np.einsum('nm,no,mo->',psi1,psi1,dL_dpsi2R) - - dL_dthetaL = ((np.square(betaY)).sum() + beta*beta*output_dim*(psi0.sum())-num_slice*output_dim*beta)/2. - beta*beta*psiR- (betaY*np.dot(betapsi1,v)).sum() - + dL_dthetaL = 0. + if uncertain_inputs: grad_dict = {'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1, @@ -281,71 +322,92 @@ class VarDTC_minibatch(LatentFunctionInference): grad_dict = {'dL_dKdiag':dL_dpsi0, 'dL_dKnm':dL_dpsi1, 'dL_dthetaL':dL_dthetaL} - + return isEnd, (n_start,n_end), grad_dict -def update_gradients(model): - model._log_marginal_likelihood, dL_dKmm, model.posterior = model.inference_method.inference_likelihood(model.kern, model.X, model.Z, model.likelihood, model.Y) +def update_gradients(model, mpi_comm=None): + if mpi_comm == None: + Y = model.Y + X = model.X + else: + Y = model.Y_local + X = model.X[model.N_range[0]:model.N_range[1]] + model._log_marginal_likelihood, dL_dKmm, model.posterior = model.inference_method.inference_likelihood(model.kern, X, model.Z, model.likelihood, Y) + het_noise = model.likelihood.variance.size > 1 - + if het_noise: dL_dthetaL = np.empty((model.Y.shape[0],)) else: - dL_dthetaL = 0 - - #gradients w.r.t. kernel - model.kern.update_gradients_full(dL_dKmm, model.Z, None) + dL_dthetaL = np.float64(0.) + kern_grad = model.kern.gradient.copy() - - #gradients w.r.t. Z - model.Z.gradient = model.kern.gradients_X(dL_dKmm, model.Z) - + kern_grad[:] = 0. + model.Z.gradient = 0. + isEnd = False while not isEnd: - isEnd, n_range, grad_dict = model.inference_method.inference_minibatch(model.kern, model.X, model.Z, model.likelihood, model.Y) + isEnd, n_range, grad_dict = model.inference_method.inference_minibatch(model.kern, X, model.Z, model.likelihood, Y) if isinstance(model.X, VariationalPosterior): - X_slice = model.X[n_range[0]:n_range[1]] - - dL_dpsi1 = grad_dict['dL_dpsi1']#[None, :] - dL_dpsi2 = grad_dict['dL_dpsi2'][None, :, :] + if (n_range[1]-n_range[0])==X.shape[0]: + X_slice = X + elif mpi_comm ==None: + X_slice = model.X[n_range[0]:n_range[1]] + else: + X_slice = model.X[model.N_range[0]+n_range[0]:model.N_range[0]+n_range[1]] + #gradients w.r.t. kernel - model.kern.update_gradients_expectations(variational_posterior=X_slice,Z=model.Z,dL_dpsi0=grad_dict['dL_dpsi0'],dL_dpsi1=dL_dpsi1,dL_dpsi2=dL_dpsi2) + model.kern.update_gradients_expectations(variational_posterior=X_slice, Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2']) kern_grad += model.kern.gradient - + #gradients w.r.t. Z model.Z.gradient += model.kern.gradients_Z_expectations( - dL_dpsi0=grad_dict['dL_dpsi0'], - dL_dpsi1=dL_dpsi1, - dL_dpsi2=dL_dpsi2, - Z=model.Z, variational_posterior=X_slice) - + dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'], Z=model.Z, variational_posterior=X_slice) + #gradients w.r.t. 
posterior parameters of X - X_grad = model.kern.gradients_qX_expectations( - variational_posterior=X_slice, - Z=model.Z, - dL_dpsi0=grad_dict['dL_dpsi0'], - dL_dpsi1=dL_dpsi1, - dL_dpsi2=dL_dpsi2) - - model.X.mean[n_range[0]:n_range[1]].gradient = X_grad[0] - model.X.variance[n_range[0]:n_range[1]].gradient = X_grad[1] - + X_grad = model.kern.gradients_qX_expectations(variational_posterior=X_slice, Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2']) + model.set_X_gradients(X_slice, X_grad) + if het_noise: dL_dthetaL[n_range[0]:n_range[1]] = grad_dict['dL_dthetaL'] else: dL_dthetaL += grad_dict['dL_dthetaL'] - #import ipdb;ipdb.set_trace() - model.grad_dict = grad_dict - if isinstance(model.X, VariationalPosterior): - # Update Log-likelihood - model._log_marginal_likelihood -= model.variational_prior.KL_divergence(model.X) - # update for the KL divergence - model.variational_prior.update_gradients_KL(model.X) + + # Gather the gradients from multiple MPI nodes + if mpi_comm != None: + if het_noise: + assert False, "Not implemented!" + kern_grad_all = kern_grad.copy() + Z_grad_all = model.Z.gradient.copy() + mpi_comm.Allreduce([kern_grad, MPI.DOUBLE], [kern_grad_all, MPI.DOUBLE]) + mpi_comm.Allreduce([model.Z.gradient, MPI.DOUBLE], [Z_grad_all, MPI.DOUBLE]) + kern_grad = kern_grad_all + model.Z.gradient = Z_grad_all + + #gradients w.r.t. kernel + model.kern.update_gradients_full(dL_dKmm, model.Z, None) + model.kern.gradient += kern_grad - # Set the gradients w.r.t. kernel - model.kern.gradient = kern_grad + #gradients w.r.t. Z + model.Z.gradient += model.kern.gradients_X(dL_dKmm, model.Z) + + # Update Log-likelihood + KL_div = model.variational_prior.KL_divergence(X) + # update for the KL divergence + model.variational_prior.update_gradients_KL(X) + + if mpi_comm != None: + KL_div_all = np.array(KL_div) + mpi_comm.Allreduce([np.float64(KL_div), MPI.DOUBLE], [KL_div_all, MPI.DOUBLE]) + KL_div = KL_div_all + [mpi_comm.Allgatherv([pp.copy(), MPI.DOUBLE], [pa, (model.N_list*pa.shape[-1], None), MPI.DOUBLE]) for pp,pa in zip(model.get_X_gradients(X),model.get_X_gradients(model.X))] + from ...models import SSGPLVM + if isinstance(model, SSGPLVM): + grad_pi = np.array(model.variational_prior.pi.gradient) + mpi_comm.Allreduce([grad_pi.copy(), MPI.DOUBLE], [model.variational_prior.pi.gradient, MPI.DOUBLE]) + model._log_marginal_likelihood -= KL_div # dL_dthetaL model.likelihood.update_gradients(dL_dthetaL) diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py index 2faf57c6..39529843 100644 --- a/GPy/kern/__init__.py +++ b/GPy/kern/__init__.py @@ -8,12 +8,13 @@ from _src.mlp import MLP from _src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52 from _src.independent_outputs import IndependentOutputs, Hierarchical from _src.coregionalize import Coregionalize -from _src.ssrbf import SSRBF # TODO: ZD: did you remove this? 
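On the MPI path added to update_gradients above: each node computes gradients on its own slice of the data (Y_local, X[N_range]) and the per-node results are summed with Allreduce. The reduction pattern, sketched with hypothetical names (mpi4py is an optional dependency, hence the guarded import in the module):

    import numpy as np
    from mpi4py import MPI

    def allreduce_sum(mpi_comm, local_grad):
        # sum a float64 gradient array across all MPI ranks (default op is SUM)
        total = np.empty_like(local_grad)
        mpi_comm.Allreduce([local_grad, MPI.DOUBLE], [total, MPI.DOUBLE])
        return total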
from _src.ODE_UY import ODE_UY from _src.ODE_UYC import ODE_UYC from _src.ODE_st import ODE_st from _src.ODE_t import ODE_t from _src.poly import Poly + +from _src.trunclinear import TruncLinear,TruncLinear_inf from _src.splitKern import SplitKern,DiffGenomeKern # TODO: put this in an init file somewhere diff --git a/GPy/kern/_src/add.py b/GPy/kern/_src/add.py index ee743f8b..2b0e531f 100644 --- a/GPy/kern/_src/add.py +++ b/GPy/kern/_src/add.py @@ -63,13 +63,16 @@ class Add(CombinationKernel): target = np.zeros(X.shape) [target.__iadd__(p.gradients_X_diag(dL_dKdiag, X)) for p in self.parts] return target - + + @Cache_this(limit=2, force_kwargs=['which_parts']) def psi0(self, Z, variational_posterior): return reduce(np.add, (p.psi0(Z, variational_posterior) for p in self.parts)) - + + @Cache_this(limit=2, force_kwargs=['which_parts']) def psi1(self, Z, variational_posterior): return reduce(np.add, (p.psi1(Z, variational_posterior) for p in self.parts)) + @Cache_this(limit=2, force_kwargs=['which_parts']) def psi2(self, Z, variational_posterior): psi2 = reduce(np.add, (p.psi2(Z, variational_posterior) for p in self.parts)) #return psi2 @@ -88,17 +91,18 @@ class Add(CombinationKernel): # rbf X bias #elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)): elif isinstance(p1, Bias) and isinstance(p2, (RBF, Linear)): - tmp = p2.psi1(Z, variational_posterior) - psi2 += p1.variance * (tmp[:, :, None] + tmp[:, None, :]) + tmp = p2.psi1(Z, variational_posterior).sum(axis=0) + psi2 += p1.variance * (tmp[:,None]+tmp[None,:]) #(tmp[:, :, None] + tmp[:, None, :]) #elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)): elif isinstance(p2, Bias) and isinstance(p1, (RBF, Linear)): - tmp = p1.psi1(Z, variational_posterior) - psi2 += p2.variance * (tmp[:, :, None] + tmp[:, None, :]) + tmp = p1.psi1(Z, variational_posterior).sum(axis=0) + psi2 += p2.variance * (tmp[:,None]+tmp[None,:]) #(tmp[:, :, None] + tmp[:, None, :]) elif isinstance(p2, (RBF, Linear)) and isinstance(p1, (RBF, Linear)): assert np.intersect1d(p1.active_dims, p2.active_dims).size == 0, "only non overlapping kernel dimensions allowed so far" tmp1 = p1.psi1(Z, variational_posterior) tmp2 = p2.psi1(Z, variational_posterior) - psi2 += (tmp1[:, :, None] * tmp2[:, None, :]) + (tmp2[:, :, None] * tmp1[:, None, :]) + psi2 += np.einsum('nm,no->mo',tmp1,tmp2)+np.einsum('nm,no->mo',tmp2,tmp1) + #(tmp1[:, :, None] * tmp2[:, None, :]) + (tmp2[:, :, None] * tmp1[:, None, :]) else: raise NotImplementedError, "psi2 cannot be computed for this kernel" return psi2 diff --git a/GPy/kern/_src/linear.py b/GPy/kern/_src/linear.py index 647f2ca7..41ec3cae 100644 --- a/GPy/kern/_src/linear.py +++ b/GPy/kern/_src/linear.py @@ -3,16 +3,13 @@ import numpy as np -from scipy import weave from kern import Kern from ...util.linalg import tdot -from ...util.misc import param_to_array from ...core.parameterization import Param from ...core.parameterization.transformations import Logexp from ...util.caching import Cache_this -from ...core.parameterization import variational -from psi_comp import linear_psi_comp from ...util.config import * +from .psi_comp import PSICOMP_Linear class Linear(Kern): """ @@ -53,7 +50,8 @@ class Linear(Kern): self.variances = Param('variances', variances, Logexp()) self.add_parameter(self.variances) - + self.psicomp = PSICOMP_Linear() + @Cache_this(limit=2) def K(self, X, X2=None): if self.ARD: @@ -95,243 +93,43 @@ class Linear(Kern): def gradients_X(self, dL_dK, X, X2=None): if X2 is None: - return 2.*(((X[None,:, 
:] * self.variances)) * dL_dK[:, :, None]).sum(1) + return np.einsum('mq,nm->nq',X*self.variances,dL_dK)+np.einsum('nq,nm->mq',X*self.variances,dL_dK) else: return (((X2[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1) def gradients_X_diag(self, dL_dKdiag, X): return 2.*self.variances*dL_dKdiag[:,None]*X + def input_sensitivity(self): + return np.ones(self.input_dim) * self.variances + #---------------------------------------# # PSI statistics # #---------------------------------------# def psi0(self, Z, variational_posterior): - if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): - gamma = variational_posterior.binary_prob - mu = variational_posterior.mean - S = variational_posterior.variance - - return np.einsum('q,nq,nq->n',self.variances,gamma,np.square(mu)+S) -# return (self.variances*gamma*(np.square(mu)+S)).sum(axis=1) - else: - return np.sum(self.variances * self._mu2S(variational_posterior), 1) + return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[0] def psi1(self, Z, variational_posterior): - if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): - gamma = variational_posterior.binary_prob - mu = variational_posterior.mean - return np.einsum('nq,q,mq,nq->nm',gamma,self.variances,Z,mu) -# return (self.variances*gamma*mu).sum(axis=1) - else: - return self.K(variational_posterior.mean, Z) #the variance, it does nothing + return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[1] @Cache_this(limit=1) def psi2(self, Z, variational_posterior): - if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): - gamma = variational_posterior.binary_prob - mu = variational_posterior.mean - S = variational_posterior.variance - mu2 = np.square(mu) - variances2 = np.square(self.variances) - tmp = np.einsum('nq,q,mq,nq->nm',gamma,self.variances,Z,mu) - return np.einsum('nq,q,mq,oq,nq->nmo',gamma,variances2,Z,Z,mu2+S)+\ - np.einsum('nm,no->nmo',tmp,tmp) - np.einsum('nq,q,mq,oq,nq->nmo',np.square(gamma),variances2,Z,Z,mu2) - else: - ZA = Z * self.variances - ZAinner = self._ZAinner(variational_posterior, Z) - return np.dot(ZAinner, ZA.T) + return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[2] def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): - if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): - gamma = variational_posterior.binary_prob - mu = variational_posterior.mean - S = variational_posterior.variance - mu2S = np.square(mu)+S - _dpsi2_dvariance, _, _, _, _ = linear_psi_comp._psi2computations(self.variances, Z, mu, S, gamma) - grad = np.einsum('n,nq,nq->q',dL_dpsi0,gamma,mu2S) + np.einsum('nm,nq,mq,nq->q',dL_dpsi1,gamma,Z,mu) +\ - np.einsum('nmo,nmoq->q',dL_dpsi2,_dpsi2_dvariance) - if self.ARD: - self.variances.gradient = grad - else: - self.variances.gradient = grad.sum() + dL_dvar = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[0] + if self.ARD: + self.variances.gradient = dL_dvar else: - #psi1 - self.update_gradients_full(dL_dpsi1, variational_posterior.mean, Z) - # psi0: - tmp = dL_dpsi0[:, None] * self._mu2S(variational_posterior) - if self.ARD: self.variances.gradient += tmp.sum(0) - else: self.variances.gradient += tmp.sum() - #psi2 - if self.ARD: - tmp = dL_dpsi2[:, :, :, None] * (self._ZAinner(variational_posterior, Z)[:, :, None, :] * Z[None, None, :, :]) - self.variances.gradient += 2.*tmp.sum(0).sum(0).sum(0) - else: 
- self.variances.gradient += 2.*np.sum(dL_dpsi2 * self.psi2(Z, variational_posterior))/self.variances + self.variances.gradient = dL_dvar.sum() def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): - if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): - gamma = variational_posterior.binary_prob - mu = variational_posterior.mean - S = variational_posterior.variance - _, _, _, _, _dpsi2_dZ = linear_psi_comp._psi2computations(self.variances, Z, mu, S, gamma) - - grad = np.einsum('nm,nq,q,nq->mq',dL_dpsi1,gamma, self.variances,mu) +\ - np.einsum('nmo,noq->mq',dL_dpsi2,_dpsi2_dZ) - - return grad - else: - #psi1 - grad = self.gradients_X(dL_dpsi1.T, Z, variational_posterior.mean) - #psi2 - self._weave_dpsi2_dZ(dL_dpsi2, Z, variational_posterior, grad) - return grad + return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[1] def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): - if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): - gamma = variational_posterior.binary_prob - mu = variational_posterior.mean - S = variational_posterior.variance - mu2S = np.square(mu)+S - _, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _ = linear_psi_comp._psi2computations(self.variances, Z, mu, S, gamma) + return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[2:] - grad_gamma = np.einsum('n,q,nq->nq',dL_dpsi0,self.variances,mu2S) + np.einsum('nm,q,mq,nq->nq',dL_dpsi1,self.variances,Z,mu) +\ - np.einsum('nmo,nmoq->nq',dL_dpsi2,_dpsi2_dgamma) - grad_mu = np.einsum('n,nq,q,nq->nq',dL_dpsi0,gamma,2.*self.variances,mu) + np.einsum('nm,nq,q,mq->nq',dL_dpsi1,gamma,self.variances,Z) +\ - np.einsum('nmo,nmoq->nq',dL_dpsi2,_dpsi2_dmu) - grad_S = np.einsum('n,nq,q->nq',dL_dpsi0,gamma,self.variances) + np.einsum('nmo,nmoq->nq',dL_dpsi2,_dpsi2_dS) - - return grad_mu, grad_S, grad_gamma - else: - grad_mu, grad_S = np.zeros(variational_posterior.mean.shape), np.zeros(variational_posterior.mean.shape) - # psi0 - grad_mu += dL_dpsi0[:, None] * (2.0 * variational_posterior.mean * self.variances) - grad_S += dL_dpsi0[:, None] * self.variances - # psi1 - grad_mu += (dL_dpsi1[:, :, None] * (Z * self.variances)).sum(1) - # psi2 - self._weave_dpsi2_dmuS(dL_dpsi2, Z, variational_posterior, grad_mu, grad_S) - - return grad_mu, grad_S - - #--------------------------------------------------# - # Helpers for psi statistics # - #--------------------------------------------------# - - - def _weave_dpsi2_dmuS(self, dL_dpsi2, Z, vp, target_mu, target_S): - # Think N,num_inducing,num_inducing,input_dim - ZA = Z * self.variances - AZZA = ZA.T[:, None, :, None] * ZA[None, :, None, :] - AZZA = AZZA + AZZA.swapaxes(1, 2) - AZZA_2 = AZZA/2. 
- if config.getboolean('parallel', 'openmp'): - pragma_string = '#pragma omp parallel for private(m,mm,q,qq,factor,tmp)' - header_string = '#include ' - weave_options = {'headers' : [''], - 'extra_compile_args': ['-fopenmp -O3'], - 'extra_link_args' : ['-lgomp'], - 'libraries': ['gomp']} - else: - pragma_string = '' - header_string = '' - weave_options = {'extra_compile_args': ['-O3']} - - #Using weave, we can exploit the symmetry of this problem: - code = """ - int n, m, mm,q,qq; - double factor,tmp; - %s - for(n=0;n - """ % header_string - mu = vp.mean - N,num_inducing,input_dim,mu = mu.shape[0],Z.shape[0],mu.shape[1],param_to_array(mu) - weave.inline(code, support_code=support_code, - arg_names=['N','num_inducing','input_dim','mu','AZZA','AZZA_2','target_mu','target_S','dL_dpsi2'], - type_converters=weave.converters.blitz,**weave_options) - - - def _weave_dpsi2_dZ(self, dL_dpsi2, Z, vp, target): - AZA = self.variances*self._ZAinner(vp, Z) - - if config.getboolean('parallel', 'openmp'): - pragma_string = '#pragma omp parallel for private(n,mm,q)' - header_string = '#include ' - weave_options = {'headers' : [''], - 'extra_compile_args': ['-fopenmp -O3'], - 'extra_link_args' : ['-lgomp'], - 'libraries': ['gomp']} - else: - pragma_string = '' - header_string = '' - weave_options = {'extra_compile_args': ['-O3']} - - code=""" - int n,m,mm,q; - %s - for(m=0;m - """ % header_string - - N,num_inducing,input_dim = vp.mean.shape[0],Z.shape[0],vp.mean.shape[1] - mu = param_to_array(vp.mean) - weave.inline(code, support_code=support_code, - arg_names=['N','num_inducing','input_dim','AZA','target','dL_dpsi2'], - type_converters=weave.converters.blitz,**weave_options) - - - @Cache_this(limit=1, ignore_args=(0,)) - def _mu2S(self, vp): - return np.square(vp.mean) + vp.variance - - @Cache_this(limit=1) - def _ZAinner(self, vp, Z): - ZA = Z*self.variances - inner = (vp.mean[:, None, :] * vp.mean[:, :, None]) - diag_indices = np.diag_indices(vp.mean.shape[1], 2) - inner[:, diag_indices[0], diag_indices[1]] += vp.variance - - return np.dot(ZA, inner).swapaxes(0, 1) # NOTE: self.ZAinner \in [num_inducing x num_data x input_dim]! - - def input_sensitivity(self): - return np.ones(self.input_dim) * self.variances class LinearFull(Kern): def __init__(self, input_dim, rank, W=None, kappa=None, active_dims=None, name='linear_full'): diff --git a/GPy/kern/_src/psi_comp/__init__.py b/GPy/kern/_src/psi_comp/__init__.py index 4c0d373d..5aabd3b1 100644 --- a/GPy/kern/_src/psi_comp/__init__.py +++ b/GPy/kern/_src/psi_comp/__init__.py @@ -1,2 +1,50 @@ # Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
# Licensed under the BSD 3-clause license (see LICENSE.txt) + +from ....core.parameterization.parameter_core import Pickleable +from GPy.util.caching import Cache_this +from ....core.parameterization import variational +import rbf_psi_comp +import ssrbf_psi_comp +import sslinear_psi_comp +import linear_psi_comp + +class PSICOMP_RBF(object): + + @Cache_this(limit=2, ignore_args=(0,)) + def psicomputations(self, variance, lengthscale, Z, variational_posterior): + if isinstance(variational_posterior, variational.NormalPosterior): + return rbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior) + elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior): + return ssrbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior) + else: + raise ValueError, "unknown distriubtion received for psi-statistics" + + @Cache_this(limit=2, ignore_args=(0,1,2,3)) + def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): + if isinstance(variational_posterior, variational.NormalPosterior): + return rbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior) + elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior): + return ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior) + else: + raise ValueError, "unknown distriubtion received for psi-statistics" + +class PSICOMP_Linear(object): + + @Cache_this(limit=2, ignore_args=(0,)) + def psicomputations(self, variance, Z, variational_posterior): + if isinstance(variational_posterior, variational.NormalPosterior): + return linear_psi_comp.psicomputations(variance, Z, variational_posterior) + elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior): + return sslinear_psi_comp.psicomputations(variance, Z, variational_posterior) + else: + raise ValueError, "unknown distriubtion received for psi-statistics" + + @Cache_this(limit=2, ignore_args=(0,1,2,3)) + def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior): + if isinstance(variational_posterior, variational.NormalPosterior): + return linear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior) + elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior): + return sslinear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior) + else: + raise ValueError, "unknown distriubtion received for psi-statistics" \ No newline at end of file diff --git a/GPy/kern/_src/psi_comp/linear_psi_comp.py b/GPy/kern/_src/psi_comp/linear_psi_comp.py index 22147366..4f064a59 100644 --- a/GPy/kern/_src/psi_comp/linear_psi_comp.py +++ b/GPy/kern/_src/psi_comp/linear_psi_comp.py @@ -2,14 +2,47 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) """ -The package for the Psi statistics computation of the linear kernel for SSGPLVM +The package for the Psi statistics computation of the linear kernel for Bayesian GPLVM """ import numpy as np -from GPy.util.caching import Cache_this -#@Cache_this(limit=1) -def _psi2computations(variance, Z, mu, S, gamma): +def psicomputations(variance, Z, variational_posterior): + """ + Compute psi-statistics for ss-linear kernel + """ + # here are the "statistics" for psi0, psi1 and psi2 + # Produced intermediate results: + # psi0 N + # psi1 NxM + # psi2 MxM + mu = 
variational_posterior.mean + S = variational_posterior.variance + + psi0 = np.einsum('q,nq->n',variance,np.square(mu)+S) + psi1 = np.einsum('q,mq,nq->nm',variance,Z,mu) + + tmp = np.einsum('q,mq,nq->nm',variance,Z,mu) + psi2 = np.einsum('q,mq,oq,nq->mo',np.square(variance),Z,Z,S) + np.einsum('nm,no->mo',tmp,tmp) + + return psi0, psi1, psi2 + +def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior): + mu = variational_posterior.mean + S = variational_posterior.variance + + dL_dvar, dL_dmu, dL_dS, dL_dZ = _psi2computations(dL_dpsi2, variance, Z, mu, S) + + # Compute for psi0 and psi1 + mu2S = np.square(mu)+S + dL_dvar += np.einsum('n,nq->q',dL_dpsi0,mu2S) + np.einsum('nm,mq,nq->q',dL_dpsi1,Z,mu) + dL_dmu += np.einsum('n,q,nq->nq',dL_dpsi0,2.*variance,mu) + np.einsum('nm,q,mq->nq',dL_dpsi1,variance,Z) + dL_dS += np.einsum('n,q->nq',dL_dpsi0,variance) + dL_dZ += np.einsum('nm,q,nq->mq',dL_dpsi1, variance,mu) + + return dL_dvar, dL_dZ, dL_dmu, dL_dS + +def _psi2computations(dL_dpsi2, variance, Z, mu, S): """ Z - MxQ mu - NxQ @@ -18,34 +51,24 @@ def _psi2computations(variance, Z, mu, S, gamma): """ # here are the "statistics" for psi1 and psi2 # Produced intermediate results: - # _psi2 NxMxM - # _psi2_dvariance NxMxMxQ - # _psi2_dZ NxMxQ - # _psi2_dgamma NxMxMxQ - # _psi2_dmu NxMxMxQ - # _psi2_dS NxMxMxQ + # _psi2_dvariance Q + # _psi2_dZ MxQ + # _psi2_dmu NxQ + # _psi2_dS NxQ - mu2 = np.square(mu) - gamma2 = np.square(gamma) variance2 = np.square(variance) - mu2S = mu2+S # NxQ - common_sum = np.einsum('nq,q,mq,nq->nm',gamma,variance,Z,mu) # NxM - - _dpsi2_dvariance = np.einsum('nq,q,mq,oq->nmoq',2.*(gamma*mu2S-gamma2*mu2),variance,Z,Z)+\ - np.einsum('nq,mq,nq,no->nmoq',gamma,Z,mu,common_sum)+\ - np.einsum('nq,oq,nq,nm->nmoq',gamma,Z,mu,common_sum) - - _dpsi2_dgamma = np.einsum('q,mq,oq,nq->nmoq',variance2,Z,Z,(mu2S-2.*gamma*mu2))+\ - np.einsum('q,mq,nq,no->nmoq',variance,Z,mu,common_sum)+\ - np.einsum('q,oq,nq,nm->nmoq',variance,Z,mu,common_sum) - - _dpsi2_dmu = np.einsum('q,mq,oq,nq,nq->nmoq',variance2,Z,Z,mu,2.*(gamma-gamma2))+\ - np.einsum('nq,q,mq,no->nmoq',gamma,variance,Z,common_sum)+\ - np.einsum('nq,q,oq,nm->nmoq',gamma,variance,Z,common_sum) - - _dpsi2_dS = np.einsum('nq,q,mq,oq->nmoq',gamma,variance2,Z,Z) - - _dpsi2_dZ = 2.*(np.einsum('nq,q,mq,nq->nmq',gamma,variance2,Z,mu2S)+np.einsum('nq,q,nq,nm->nmq',gamma,variance,mu,common_sum) - -np.einsum('nq,q,mq,nq->nmq',gamma2,variance2,Z,mu2)) + common_sum = np.einsum('q,mq,nq->nm',variance,Z,mu) # NxM - return _dpsi2_dvariance, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _dpsi2_dZ \ No newline at end of file + dL_dvar = np.einsum('mo,nq,q,mq,oq->q',dL_dpsi2,2.*S,variance,Z,Z)+\ + np.einsum('mo,mq,nq,no->q',dL_dpsi2,Z,mu,common_sum)+\ + np.einsum('mo,oq,nq,nm->q',dL_dpsi2,Z,mu,common_sum) + + dL_dmu = np.einsum('mo,q,mq,no->nq',dL_dpsi2,variance,Z,common_sum)+\ + np.einsum('mo,q,oq,nm->nq',dL_dpsi2,variance,Z,common_sum) + + dL_dS = np.empty(S.shape) + dL_dS[:] = np.einsum('mo,q,mq,oq->q',dL_dpsi2,variance2,Z,Z) + + dL_dZ = 2.*(np.einsum('om,q,mq,nq->oq',dL_dpsi2,variance2,Z,S)+np.einsum('om,q,nq,nm->oq',dL_dpsi2,variance,mu,common_sum)) + + return dL_dvar, dL_dmu, dL_dS, dL_dZ diff --git a/GPy/kern/_src/psi_comp/rbf_psi_comp.py b/GPy/kern/_src/psi_comp/rbf_psi_comp.py new file mode 100644 index 00000000..93399ea7 --- /dev/null +++ b/GPy/kern/_src/psi_comp/rbf_psi_comp.py @@ -0,0 +1,162 @@ +""" +The module for psi-statistics for RBF kernel +""" + +import numpy as np +from GPy.util.caching import Cacher + 
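+# The psi statistics computed in this module are expectations of the RBF kernel
+# under the variational posterior q(X) = prod_n N(x_n | mu_n, diag(S_n)):
+#   psi0[n]   = <k(x_n, x_n)>_q                        shape (N,)
+#   psi1[n,m] = <k(x_n, Z_m)>_q                        shape (N, M)
+#   psi2[m,o] = sum_n <k(x_n, Z_m) k(x_n, Z_o)>_q      shape (M, M)
+# A rough usage sketch (argument names as in the function below): with inducing
+# inputs Z of shape (M, Q) and a posterior exposing .mean and .variance, both of
+# shape (N, Q),
+#   psi0, psi1, psi2 = psicomputations(variance, lengthscale, Z, variational_posterior)
+# returns arrays with the shapes listed above.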
+def psicomputations(variance, lengthscale, Z, variational_posterior): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi0, psi1 and psi2 + # Produced intermediate results: + # _psi1 NxM + mu = variational_posterior.mean + S = variational_posterior.variance + + psi0 = np.empty(mu.shape[0]) + psi0[:] = variance + psi1 = _psi1computations(variance, lengthscale, Z, mu, S) + psi2 = _psi2computations(variance, lengthscale, Z, mu, S).sum(axis=0) + return psi0, psi1, psi2 + +def __psi1computations(variance, lengthscale, Z, mu, S): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi1 + # Produced intermediate results: + # _psi1 NxM + + lengthscale2 = np.square(lengthscale) + + # psi1 + _psi1_logdenom = np.log(S/lengthscale2+1.).sum(axis=-1) # N + _psi1_log = (_psi1_logdenom[:,None]+np.einsum('nmq,nq->nm',np.square(mu[:,None,:]-Z[None,:,:]),1./(S+lengthscale2)))/(-2.) + _psi1 = variance*np.exp(_psi1_log) + + return _psi1 + +def __psi2computations(variance, lengthscale, Z, mu, S): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi2 + # Produced intermediate results: + # _psi2 MxM + + lengthscale2 = np.square(lengthscale) + + _psi2_logdenom = np.log(2.*S/lengthscale2+1.).sum(axis=-1)/(-2.) # N + _psi2_exp1 = (np.square(Z[:,None,:]-Z[None,:,:])/lengthscale2).sum(axis=-1)/(-4.) #MxM + Z_hat = (Z[:,None,:]+Z[None,:,:])/2. #MxMxQ + denom = 1./(2.*S+lengthscale2) + _psi2_exp2 = -(np.square(mu)*denom).sum(axis=-1)[:,None,None]+2.*np.einsum('nq,moq,nq->nmo',mu,Z_hat,denom)-np.einsum('moq,nq->nmo',np.square(Z_hat),denom) + _psi2 = variance*variance*np.exp(_psi2_logdenom[:,None,None]+_psi2_exp1[None,:,:]+_psi2_exp2) + + + return _psi2 + +def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): + ARD = (len(lengthscale)!=1) + + dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance) + dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance) + + dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2 + + dL_dlengscale = dl_psi1 + dl_psi2 + if not ARD: + dL_dlengscale = dL_dlengscale.sum() + + dL_dmu = dmu_psi1 + dmu_psi2 + dL_dS = dS_psi1 + dS_psi2 + dL_dZ = dZ_psi1 + dZ_psi2 + + return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS + +def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S): + """ + dL_dpsi1 - NxM + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi1 + # Produced intermediate results: dL_dparams w.r.t. psi1 + # _dL_dvariance 1 + # _dL_dlengthscale Q + # _dL_dZ MxQ + # _dL_dgamma NxQ + # _dL_dmu NxQ + # _dL_dS NxQ + + lengthscale2 = np.square(lengthscale) + + _psi1 = _psi1computations(variance, lengthscale, Z, mu, S) + Lpsi1 = dL_dpsi1*_psi1 + Zmu = Z[None,:,:]-mu[:,None,:] # NxMxQ + denom = 1./(S+lengthscale2) + Zmu2_denom = np.square(Zmu)*denom[:,None,:] #NxMxQ + _dL_dvar = Lpsi1.sum()/variance + _dL_dmu = np.einsum('nm,nmq,nq->nq',Lpsi1,Zmu,denom) + _dL_dS = np.einsum('nm,nmq,nq->nq',Lpsi1,(Zmu2_denom-1.),denom)/2. 
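+    # Note: the terms above and below all share the factor Lpsi1 = dL_dpsi1 * _psi1,
+    # since psi1 = variance * exp(...) implies d psi1 / d theta = psi1 * d log(psi1) / d theta;
+    # only the log-derivative factor differs between variance, mu, S, Z and lengthscale.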
+ _dL_dZ = -np.einsum('nm,nmq,nq->mq',Lpsi1,Zmu,denom) + _dL_dl = np.einsum('nm,nmq,nq->q',Lpsi1,(Zmu2_denom+(S/lengthscale2)[:,None,:]),denom*lengthscale) + + return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS + +def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + dL_dpsi2 - MxM + """ + # here are the "statistics" for psi2 + # Produced the derivatives w.r.t. psi2: + # _dL_dvariance 1 + # _dL_dlengthscale Q + # _dL_dZ MxQ + # _dL_dgamma NxQ + # _dL_dmu NxQ + # _dL_dS NxQ + + lengthscale2 = np.square(lengthscale) + denom = 1./(2*S+lengthscale2) + denom2 = np.square(denom) + + _psi2 = _psi2computations(variance, lengthscale, Z, mu, S) # NxMxM + + Lpsi2 = dL_dpsi2[None,:,:]*_psi2 + Lpsi2sum = np.einsum('nmo->n',Lpsi2) #N + Lpsi2Z = np.einsum('nmo,oq->nq',Lpsi2,Z) #NxQ + Lpsi2Z2 = np.einsum('nmo,oq,oq->nq',Lpsi2,Z,Z) #NxQ + Lpsi2Z2p = np.einsum('nmo,mq,oq->nq',Lpsi2,Z,Z) #NxQ + Lpsi2Zhat = Lpsi2Z + Lpsi2Zhat2 = (Lpsi2Z2+Lpsi2Z2p)/2 + + _dL_dvar = Lpsi2sum.sum()*2/variance + _dL_dmu = (-2*denom) * (mu*Lpsi2sum[:,None]-Lpsi2Zhat) + _dL_dS = (2*np.square(denom))*(np.square(mu)*Lpsi2sum[:,None]-2*mu*Lpsi2Zhat+Lpsi2Zhat2) - denom*Lpsi2sum[:,None] + _dL_dZ = -np.einsum('nmo,oq->oq',Lpsi2,Z)/lengthscale2+np.einsum('nmo,oq->mq',Lpsi2,Z)/lengthscale2+ \ + 2*np.einsum('nmo,nq,nq->mq',Lpsi2,mu,denom) - np.einsum('nmo,nq,mq->mq',Lpsi2,denom,Z) - np.einsum('nmo,oq,nq->mq',Lpsi2,Z,denom) + _dL_dl = 2*lengthscale* ((S/lengthscale2*denom+np.square(mu*denom))*Lpsi2sum[:,None]+(Lpsi2Z2-Lpsi2Z2p)/(2*np.square(lengthscale2))- + (2*mu*denom2)*Lpsi2Zhat+denom2*Lpsi2Zhat2).sum(axis=0) + + return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS + +_psi1computations = Cacher(__psi1computations, limit=1) +_psi2computations = Cacher(__psi2computations, limit=1) diff --git a/GPy/kern/_src/psi_comp/rbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/rbf_psi_gpucomp.py new file mode 100644 index 00000000..73c2c33b --- /dev/null +++ b/GPy/kern/_src/psi_comp/rbf_psi_gpucomp.py @@ -0,0 +1,411 @@ +""" +The module for psi-statistics for RBF kernel +""" + +import numpy as np +from ....util.caching import Cache_this +from . 
import PSICOMP_RBF +from ....util import gpu_init + +try: + import pycuda.gpuarray as gpuarray + from pycuda.compiler import SourceModule + from ....util.linalg_gpu import sum_axis +except: + pass + +gpu_code = """ + // define THREADNUM + + #define IDX_NMQ(n,m,q) ((q*M+m)*N+n) + #define IDX_NMM(n,m1,m2) ((m2*M+m1)*N+n) + #define IDX_NQ(n,q) (q*N+n) + #define IDX_NM(n,m) (m*N+n) + #define IDX_MQ(m,q) (q*M+m) + #define IDX_MM(m1,m2) (m2*M+m1) + #define IDX_NQB(n,q,b) ((b*Q+q)*N+n) + #define IDX_QB(q,b) (b*Q+q) + + // Divide data evenly + __device__ void divide_data(int total_data, int psize, int pidx, int *start, int *end) { + int residue = (total_data)%psize; + if(pidx= blockDim.x) { + for(int i=blockDim.x+threadIdx.x; i=1;s=s/2) { + if(threadIdx.x < s) {array[threadIdx.x] += array[s+threadIdx.x];} + __syncthreads(); + } + } + + __global__ void compDenom(double *log_denom1, double *log_denom2, double *l, double *S, int N, int Q) + { + int n_start, n_end; + divide_data(N, gridDim.x, blockIdx.x, &n_start, &n_end); + + for(int i=n_start*Q+threadIdx.x; in',variance,gamma,np.square(mu)+S) + psi1 = np.einsum('nq,q,mq,nq->nm',gamma,variance,Z,mu) + mu2 = np.square(mu) + variances2 = np.square(variance) + tmp = np.einsum('nq,q,mq,nq->nm',gamma,variance,Z,mu) + psi2 = np.einsum('nq,q,mq,oq,nq->mo',gamma,variances2,Z,Z,mu2+S)+\ + np.einsum('nm,no->mo',tmp,tmp) - np.einsum('nq,q,mq,oq,nq->mo',np.square(gamma),variances2,Z,Z,mu2) + + return psi0, psi1, psi2 + +def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior): + mu = variational_posterior.mean + S = variational_posterior.variance + gamma = variational_posterior.binary_prob + + dL_dvar, dL_dgamma, dL_dmu, dL_dS, dL_dZ = _psi2computations(dL_dpsi2, variance, Z, mu, S, gamma) + + # Compute for psi0 and psi1 + mu2S = np.square(mu)+S + dL_dvar += np.einsum('n,nq,nq->q',dL_dpsi0,gamma,mu2S) + np.einsum('nm,nq,mq,nq->q',dL_dpsi1,gamma,Z,mu) + dL_dgamma += np.einsum('n,q,nq->nq',dL_dpsi0,variance,mu2S) + np.einsum('nm,q,mq,nq->nq',dL_dpsi1,variance,Z,mu) + dL_dmu += np.einsum('n,nq,q,nq->nq',dL_dpsi0,gamma,2.*variance,mu) + np.einsum('nm,nq,q,mq->nq',dL_dpsi1,gamma,variance,Z) + dL_dS += np.einsum('n,nq,q->nq',dL_dpsi0,gamma,variance) + dL_dZ += np.einsum('nm,nq,q,nq->mq',dL_dpsi1,gamma, variance,mu) + + return dL_dvar, dL_dZ, dL_dmu, dL_dS, dL_dgamma + +def _psi2computations(dL_dpsi2, variance, Z, mu, S, gamma): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi1 and psi2 + # Produced intermediate results: + # _psi2_dvariance Q + # _psi2_dZ MxQ + # _psi2_dgamma NxQ + # _psi2_dmu NxQ + # _psi2_dS NxQ + + mu2 = np.square(mu) + gamma2 = np.square(gamma) + variance2 = np.square(variance) + mu2S = mu2+S # NxQ + common_sum = np.einsum('nq,q,mq,nq->nm',gamma,variance,Z,mu) # NxM + + dL_dvar = np.einsum('mo,nq,q,mq,oq->q',dL_dpsi2,2.*(gamma*mu2S-gamma2*mu2),variance,Z,Z)+\ + np.einsum('mo,nq,mq,nq,no->q',dL_dpsi2,gamma,Z,mu,common_sum)+\ + np.einsum('mo,nq,oq,nq,nm->q',dL_dpsi2,gamma,Z,mu,common_sum) + + dL_dgamma = np.einsum('mo,q,mq,oq,nq->nq',dL_dpsi2,variance2,Z,Z,(mu2S-2.*gamma*mu2))+\ + np.einsum('mo,q,mq,nq,no->nq',dL_dpsi2,variance,Z,mu,common_sum)+\ + np.einsum('mo,q,oq,nq,nm->nq',dL_dpsi2,variance,Z,mu,common_sum) + + dL_dmu = np.einsum('mo,q,mq,oq,nq,nq->nq',dL_dpsi2,variance2,Z,Z,mu,2.*(gamma-gamma2))+\ + np.einsum('mo,nq,q,mq,no->nq',dL_dpsi2,gamma,variance,Z,common_sum)+\ + np.einsum('mo,nq,q,oq,nm->nq',dL_dpsi2,gamma,variance,Z,common_sum) + + dL_dS = 
np.einsum('mo,nq,q,mq,oq->nq',dL_dpsi2,gamma,variance2,Z,Z) + + dL_dZ = 2.*(np.einsum('om,nq,q,mq,nq->oq',dL_dpsi2,gamma,variance2,Z,mu2S)+np.einsum('om,nq,q,nq,nm->oq',dL_dpsi2,gamma,variance,mu,common_sum) + -np.einsum('om,nq,q,mq,nq->oq',dL_dpsi2,gamma2,variance2,Z,mu2)) + + return dL_dvar, dL_dgamma, dL_dmu, dL_dS, dL_dZ diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py index 5e30b5c5..6302a590 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_comp.py @@ -6,10 +6,8 @@ The package for the psi statistics computation """ import numpy as np -from GPy.util.caching import Cache_this -@Cache_this(limit=1) -def psicomputations(variance, lengthscale, Z, mu, S, gamma): +def psicomputations(variance, lengthscale, Z, variational_posterior): """ Z - MxQ mu - NxQ @@ -19,6 +17,9 @@ def psicomputations(variance, lengthscale, Z, mu, S, gamma): # here are the "statistics" for psi0, psi1 and psi2 # Produced intermediate results: # _psi1 NxM + mu = variational_posterior.mean + S = variational_posterior.variance + gamma = variational_posterior.binary_prob psi0 = np.empty(mu.shape[0]) psi0[:] = variance @@ -37,7 +38,6 @@ def _psi1computations(variance, lengthscale, Z, mu, S, gamma): # Produced intermediate results: # _psi1 NxM - lengthscale2 = np.square(lengthscale) # psi1 @@ -147,11 +147,6 @@ def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, gamma): _dL_dS = np.einsum('nm,nmq,nmq,nq,nmq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_common,(_psi1_dist_sq-1.))/2. # NxQ _dL_dZ = np.einsum('nm,nmq,nmq->mq',dL_dpsi1,_psi1_q, (- _psi1_common[:,None,:] * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z)) _dL_dlengthscale = lengthscale* np.einsum('nm,nmq,nmq->q',dL_dpsi1,_psi1_q,(_psi1_common[:,None,:]*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + (1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z)) - -# _dpsi1_dmu = _psi1_q * (_psi1_exp_dist_sq * _psi1_dist * _psi1_common) # NxMxQ -# _dpsi1_dS = _psi1_q * (_psi1_exp_dist_sq * _psi1_common * 0.5 * (_psi1_dist_sq - 1.)) # NxMxQ -# _dpsi1_dZ = _psi1_q * (- _psi1_common * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z) # NxMxQ -# _dpsi1_dlengthscale = 2.*lengthscale*_psi1_q * (0.5*_psi1_common*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + 0.5*(1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z) # NxMxQ return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma @@ -199,15 +194,6 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, gamma): _dL_dmu = -2.*np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,_psi2_common,_psi2_mudist,_psi2_exp_dist_sq) _dL_dS = np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q, _psi2_common, (2.*_psi2_mudist_sq-1.), _psi2_exp_dist_sq) _dL_dZ = 2.*np.einsum('mo,nmoq,nmoq->mq',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(-_psi2_Zdist*_psi2_denom[:,None,None,:]+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z)) -# print _psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq #+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z) _dL_dlengthscale = 2.*lengthscale* 
np.einsum('mo,nmoq,nmoq->q',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z)) - - -# _dpsi2_dvariance = 2. * _psi2/variance # NxMxM -# _dpsi2_dgamma = _psi2_q * (_psi2_exp_dist_sq/_psi2_denom_sqrt - _psi2_exp_Z) # NxMxMxQ -# _dpsi2_dmu = _psi2_q * (-2.*_psi2_common*_psi2_mudist * _psi2_exp_dist_sq) # NxMxMxQ -# _dpsi2_dS = _psi2_q * (_psi2_common * (2.*_psi2_mudist_sq - 1.) * _psi2_exp_dist_sq) # NxMxMxQ -# _dpsi2_dZ = 2.*_psi2_q * (_psi2_common*(-_psi2_Zdist*_psi2_denom+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z) # NxMxMxQ -# _dpsi2_dlengthscale = 2.*lengthscale* _psi2_q * (_psi2_common*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z) # NxMxMxQ return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py index f49dc52a..1a9d2058 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py @@ -1,535 +1,474 @@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). -# Licensed under the BSD 3-clause license (see LICENSE.txt) """ -The package for the psi statistics computation on GPU +The module for psi-statistics for RBF kernel for Spike-and-Slab GPLVM """ import numpy as np -from GPy.util.caching import Cache_this - +from ....util.caching import Cache_this +from . import PSICOMP_RBF from ....util import gpu_init try: import pycuda.gpuarray as gpuarray - from scikits.cuda import cublas - from pycuda.reduction import ReductionKernel - from pycuda.elementwise import ElementwiseKernel - from ....util import linalg_gpu - - - # The kernel form computing psi1 het_noise - comp_psi1 = ElementwiseKernel( - "double *psi1, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q", - "psi1[i] = comp_psi1_element(var, l, Z, mu, S, logGamma, log1Gamma, logpsi1denom, N, M, Q, i)", - "comp_psi1", - preamble=""" - #define IDX_NMQ(n,m,q) ((q*M+m)*N+n) - #define IDX_NQ(n,q) (q*N+n) - #define IDX_MQ(m,q) (q*M+m) - #define LOGEXPSUM(a,b) (a>=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) - - __device__ double comp_psi1_element(double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q, int idx) - { - int n = idx%N; - int m = idx/N; - double psi1_exp=0; - for(int q=0;q=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) - - __device__ double comp_psi2_element(double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q, int idx) - { - // psi2 (n,m1,m2) - int m2 = idx/(M*N); - int m1 = (idx%(M*N))/N; - int n = idx%N; - - double psi2_exp=0; - for(int q=0;q=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) - - __device__ double comp_dpsi1_dvar_element(double *psi1_neq, double *psi1exp1, double *psi1exp2, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q, int idx) - { - int n = idx%N; - int m = idx/N; - - double psi1_sum = 0; - for(int q=0;q=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) - - __device__ double comp_dpsi2_dvar_element(double 
*psi2_neq, double *psi2exp1, double *psi2exp2, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q, int idx) - { - // psi2 (n,m1,m2) - int m2 = idx/(M*N); - int m1 = (idx%(M*N))/N; - int n = idx%N; - - double psi2_sum=0; - for(int q=0;q= blockDim.x) { + for(int i=blockDim.x+threadIdx.x; i=1;s=s/2) { + if(threadIdx.x < s) {array[threadIdx.x] += array[s+threadIdx.x];} + __syncthreads(); + } + } + + __global__ void compDenom(double *log_denom1, double *log_denom2, double *log_gamma, double*log_gamma1, double *gamma, double *l, double *S, int N, int Q) + { + int n_start, n_end; + divide_data(N, gridDim.x, blockIdx.x, &n_start, &n_end); - if self.gpuCacheAll!=None and self.gpuCacheAll['mu_gpu'].shape[0] reallocate - self._releaseMemory() + for(int i=n_start*Q+threadIdx.x; iexp2)?exp1+log1p(exp(exp2-exp1)):exp2+log1p(exp(exp1-exp2)); + } + psi1[IDX_NM(n,m)] = var*exp(log_psi1); + } + } + } + + __global__ void psi2computations(double *psi2, double *psi2n, double *log_denom2, double *log_gamma, double*log_gamma1, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q) + { + int psi2_idx_start, psi2_idx_end; + __shared__ double psi2_local[THREADNUM]; + divide_data((M+1)*M/2, gridDim.x, blockIdx.x, &psi2_idx_start, &psi2_idx_end); + + for(int psi2_idx=psi2_idx_start; psi2_idxexp2)?exp1+log1p(exp(exp2-exp1)):exp2+log1p(exp(exp1-exp2)); + } + double exp_psi2_n = exp(log_psi2_n); + psi2n[IDX_NMM(n,m1,m2)] = var*var*exp_psi2_n; + if(m1!=m2) { psi2n[IDX_NMM(n,m2,m1)] = var*var*exp_psi2_n;} + psi2_local[threadIdx.x] += exp_psi2_n; + } + __syncthreads(); + reduce_sum(psi2_local, THREADNUM); + if(threadIdx.x==0) { + psi2[IDX_MM(m1,m2)] = var*var*psi2_local[0]; + if(m1!=m2) { psi2[IDX_MM(m2,m1)] = var*var*psi2_local[0]; } + } + __syncthreads(); + } + } + + __global__ void psi1compDer(double *dvar, double *dl, double *dZ, double *dmu, double *dS, double *dgamma, double *dL_dpsi1, double *psi1, double *log_denom1, double *log_gamma, double*log_gamma1, double var, double *l, double *Z, double *mu, double *S, double *gamma, int N, int M, int Q) + { + int m_start, m_end; + __shared__ double g_local[THREADNUM]; + divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end); + int P = int(ceil(double(N)/THREADNUM)); + + double dvar_local = 0; + for(int q=0;qexp2) { + d_exp1 = 1.; + d_exp2 = exp(exp2-exp1); + } else { + d_exp1 = exp(exp1-exp2); + d_exp2 = 1.; + } + double exp_sum = d_exp1+d_exp2; + + dmu_local += lpsi1*Zmu*d_exp1/(denom*exp_sum); + dS_local += lpsi1*(Zmu2_denom-1.)*d_exp1/(denom*exp_sum); + dgamma_local += lpsi1*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum; + dl_local += lpsi1*((Zmu2_denom+Snq/lq)/denom*d_exp1+Zmq*Zmq/(lq*lq)*d_exp2)/(2.*exp_sum); + g_local[threadIdx.x] = lpsi1*(-Zmu/denom*d_exp1-Zmq/lq*d_exp2)/exp_sum; + } + __syncthreads(); + reduce_sum(g_local, pexp2) { + d_exp1 = 1.; + d_exp2 = exp(exp2-exp1); + } else { + d_exp1 = exp(exp1-exp2); + d_exp2 = 1.; + } + double exp_sum = d_exp1+d_exp2; + + dmu_local += lpsi2*muZhat/denom*d_exp1/exp_sum; + dS_local += lpsi2*(2.*muZhat2_denom-1.)/denom*d_exp1/exp_sum; + dgamma_local += lpsi2*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum; + dl_local += lpsi2*(((Snq/lq+muZhat2_denom)/denom+dZ*dZ/(4.*lq*lq))*d_exp1+Z2/(2.*lq*lq)*d_exp2)/exp_sum; + g_local[threadIdx.x] += 2.*lpsi2*((muZhat/denom-dZ/(2*lq))*d_exp1-Zm1q/lq*d_exp2)/exp_sum; + } + } + __syncthreads(); + reduce_sum(g_local, p