diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py index f83a5bc0..3bd5c347 100644 --- a/GPy/inference/latent_function_inference/var_dtc_gpu.py +++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py @@ -66,7 +66,6 @@ class VarDTC_GPU(LatentFunctionInference): 'beta_gpu' :gpuarray.empty((ndata,),np.float64,order='F'), 'YT_gpu' :gpuarray.to_gpu(np.asfortranarray(Y.T)), # DxN 'betaYT_gpu' :gpuarray.empty(Y.T.shape,np.float64,order='F'), # DxN - 'psi2_t_gpu' :gpuarray.empty((num_inducing*num_inducing*self.batchsize),np.float64,order='F'), # inference_minibatch 'dL_dpsi0_gpu' :gpuarray.empty((self.batchsize,),np.float64,order='F'), 'dL_dpsi1_gpu' :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'), @@ -122,6 +121,89 @@ class VarDTC_GPU(LatentFunctionInference): else: return jitchol(tdot(Y)) + def gatherPsiStat(self, kern, X, Z, Y, beta, uncertain_inputs, het_noise): + num_inducing, input_dim = Z.shape[0], Z.shape[1] + num_data, output_dim = Y.shape + trYYT = self._trYYT + psi1Y_gpu = self.gpuCache['psi1Y_gpu'] + psi2_gpu = self.gpuCache['psi2_gpu'] + beta_gpu = self.gpuCache['beta_gpu'] + YT_gpu = self.gpuCache['YT_gpu'] + betaYT_gpu = self.gpuCache['betaYT_gpu'] + + beta_gpu.fill(beta) + betaYT_gpu.fill(0.) + cublas.cublasDaxpy(self.cublas_handle, betaYT_gpu.size, beta, YT_gpu.gpudata, 1, betaYT_gpu.gpudata, 1) + YRY_full = trYYT*beta + + if kern.useGPU: + psi1Y_gpu.fill(0.) + psi2_gpu.fill(0.) + psi0_full = 0 + + for n_start in xrange(0,num_data,self.batchsize): + n_end = min(self.batchsize+n_start, num_data) + ndata = n_end - n_start + X_slice = X[n_start:n_end] + betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end] + + if uncertain_inputs: + psi0 = kern.psi0(Z, X_slice) + psi1p_gpu = kern.psi1(Z, X_slice) + psi2p_gpu = kern.psi2(Z, X_slice) + else: + psi0 = kern.Kdiag(X_slice) + psi1p_gpu = kern.K(X_slice, Z) + + cublas.cublasDgemm(self.cublas_handle, 'T', 'T', num_inducing, output_dim, ndata, 1.0, psi1p_gpu.gpudata, ndata, betaYT_gpu_slice.gpudata, output_dim, 1.0, psi1Y_gpu.gpudata, num_inducing) + + psi0_full += psi0.sum() + + if uncertain_inputs: + sum_axis(psi2_gpu,psi2p_gpu,1,1) + else: + cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, num_inducing, ndata, beta, psi1p_gpu.gpudata, ndata, psi1p_gpu.gpudata, ndata, 1.0, psi2_gpu.gpudata, num_inducing) + + psi0_full *= beta + if uncertain_inputs: + cublas.cublasDscal(self.cublas_handle, psi2_gpu.size, beta, psi2_gpu.gpudata, 1) + + else: + psi2_full = np.zeros((num_inducing,num_inducing)) + psi1Y_full = np.zeros((output_dim,num_inducing)) # DxM + psi0_full = 0. + YRY_full = 0. + + for n_start in xrange(0,num_data,self.batchsize): + n_end = min(self.batchsize+n_start, num_data) + Y_slice = Y[n_start:n_end] + X_slice = X[n_start:n_end] + + if het_noise: + b = beta[n_start] + YRY_full += np.inner(Y_slice, Y_slice)*b + else: + b = beta + + if uncertain_inputs: + psi0 = kern.psi0(Z, X_slice) + psi1 = kern.psi1(Z, X_slice) + psi2_full += kern.psi2(Z, X_slice)*b + else: + psi0 = kern.Kdiag(X_slice) + psi1 = kern.K(X_slice, Z) + psi2_full += np.dot(psi1.T,psi1)*b + + psi0_full += psi0.sum()*b + psi1Y_full += np.dot(Y_slice.T,psi1)*b # DxM + + if not het_noise: + YRY_full = trYYT*beta + psi1Y_gpu.set(psi1Y_full) + psi2_gpu.set(psi2_full) + + return psi0_full, YRY_full + def inference_likelihood(self, kern, X, Z, likelihood, Y): """ The first phase of inference: @@ -146,118 +228,10 @@ class VarDTC_GPU(LatentFunctionInference): else: uncertain_inputs = False - - trYYT = self._trYYT - psi1Y_gpu = self.gpuCache['psi1Y_gpu'] psi2_gpu = self.gpuCache['psi2_gpu'] - beta_gpu = self.gpuCache['beta_gpu'] - YT_gpu = self.gpuCache['YT_gpu'] - betaYT_gpu = self.gpuCache['betaYT_gpu'] - psi2_t_gpu = self.gpuCache['psi2_t_gpu'] - if het_noise: - beta_gpu.set(np.asfortranarray(beta)) - mul_bcast(betaYT_gpu,beta_gpu,YT_gpu,beta_gpu.size) - YRY_full = cublas.cublasDdot(self.cublas_handle, YT_gpu.size, betaYT_gpu.gpudata, 1, YT_gpu.gpudata, 1) - else: - beta_gpu.fill(beta) - betaYT_gpu.fill(0.) - cublas.cublasDaxpy(self.cublas_handle, betaYT_gpu.size, beta, YT_gpu.gpudata, 1, betaYT_gpu.gpudata, 1) - YRY_full = trYYT*beta - - if kern.useGPU: - psi1Y_gpu.fill(0.) - psi2_gpu.fill(0.) - psi0_full = 0 - - for n_start in xrange(0,num_data,self.batchsize): - n_end = min(self.batchsize+n_start, num_data) - ndata = n_end - n_start - X_slice = X[n_start:n_end] - beta_gpu_slice = beta_gpu[n_start:n_end] - betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end] - if ndata==self.batchsize: - psi2_t_gpu_slice = psi2_t_gpu - else: - psi2_t_gpu_slice = psi2_t_gpu[:num_inducing*num_inducing*ndata] - if uncertain_inputs: - psi0p_gpu = kern.psi0(Z, X_slice) - psi1p_gpu = kern.psi1(Z, X_slice) - psi2p_gpu = kern.psi2(Z, X_slice) - else: - psi0p_gpu = kern.Kdiag(X_slice) - psi1p_gpu = kern.K(X_slice, Z) - - cublas.cublasDgemm(self.cublas_handle, 'T', 'T', num_inducing, output_dim, ndata, 1.0, psi1p_gpu.gpudata, ndata, betaYT_gpu_slice.gpudata, output_dim, 1.0, psi1Y_gpu.gpudata, num_inducing) - - if het_noise: - psi0_full += cublas.cublasDdot(self.cublas_handle, psi0p_gpu.size, beta_gpu_slice.gpudata, 1, psi0p_gpu.gpudata, 1) - else: - psi0_full += gpuarray.sum(psi0p_gpu).get() - - if uncertain_inputs: - if het_noise: - mul_bcast(psi2_t_gpu_slice,beta_gpu_slice,psi2p_gpu,beta_gpu_slice.size) - sum_axis(psi2_gpu,psi2_t_gpu_slice,1,ndata) - else: - sum_axis(psi2_gpu,psi2p_gpu,1,ndata) - else: - if het_noise: - psi1_t_gpu = psi2_t_gpu_slice[:,num_inducing*ndata] - mul_bcast(psi1_t_gpu,beta_gpu_slice,psi1p_gpu,beta_gpu_slice.size) - cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, num_inducing, ndata, 1.0, psi1p_gpu.gpudata, ndata, psi1_t_gpu.gpudata, ndata, 1.0, psi2_gpu.gpudata, num_inducing) - else: - cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, num_inducing, ndata, beta, psi1p_gpu.gpudata, ndata, psi1p_gpu.gpudata, ndata, 1.0, psi2_gpu.gpudata, num_inducing) - - if not het_noise: - psi0_full *= beta - if uncertain_inputs: - cublas.cublasDscal(self.cublas_handle, psi2_gpu.size, beta, psi2_gpu.gpudata, 1) - - else: - psi2_full = np.zeros((num_inducing,num_inducing),order='F') - psi1Y_full = np.zeros((num_inducing,output_dim),order='F') # MxD - psi0_full = 0 - - for n_start in xrange(0,num_data,self.batchsize): - n_end = min(self.batchsize+n_start, num_data) - Y_slice = Y[n_start:n_end] - X_slice = X[n_start:n_end] - if uncertain_inputs: - psi0 = kern.psi0(Z, X_slice) - psi1 = kern.psi1(Z, X_slice) - psi2 = kern.psi2(Z, X_slice) - else: - psi0 = kern.Kdiag(X_slice) - psi1 = kern.K(X_slice, Z) - - if het_noise: - beta_slice = beta[n_start:n_end] - psi0_full += (beta_slice*psi0).sum() - psi1Y_full += np.dot(psi1.T,beta_slice[:,None]*Y_slice) # MxD - else: - psi0_full += psi0.sum() - psi1Y_full += np.dot(psi1.T,Y_slice) # MxD - - if uncertain_inputs: - if het_noise: - psi2_full += np.einsum('n,nmo->mo',beta_slice,psi2) - else: - psi2_full += psi2 - else: - if het_noise: - psi2_full += np.einsum('n,nm,no->mo',beta_slice,psi1,psi1) - else: - psi2_full += np.outer(psi1.T, psi1.T) - - if not het_noise: - psi0_full *= beta - psi1Y_full *= beta - psi2_full *= beta - - psi1Y_gpu.set(psi1Y_full) - psi2_gpu.set(psi2_full) + psi0_full, YRY_full = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs, het_noise) #====================================================================== # Compute Common Components diff --git a/GPy/inference/latent_function_inference/var_dtc_parallel.py b/GPy/inference/latent_function_inference/var_dtc_parallel.py index dc4f45d5..54bc11ac 100644 --- a/GPy/inference/latent_function_inference/var_dtc_parallel.py +++ b/GPy/inference/latent_function_inference/var_dtc_parallel.py @@ -73,13 +73,14 @@ class VarDTC_minibatch(LatentFunctionInference): else: return jitchol(tdot(Y)) - def gatherPsiStat(self, kern, X, Z, Y, beta, uncertain_inputs, het_noise): + def gatherPsiStat(self, kern, X, Z, Y, beta, uncertain_inputs): num_inducing = Z.shape[0] num_data, output_dim = Y.shape if self.batchsize == None: self.batchsize = num_data + het_noise = beta.size > 1 trYYT = self.get_trYYT(Y) @@ -88,46 +89,30 @@ class VarDTC_minibatch(LatentFunctionInference): psi0_full = 0. YRY_full = 0. - for n_start in xrange(0,num_data,self.batchsize): - + for n_start in xrange(0,num_data,self.batchsize): n_end = min(self.batchsize+n_start, num_data) - Y_slice = Y[n_start:n_end] X_slice = X[n_start:n_end] + if het_noise: + b = beta[n_start] + YRY_full += np.inner(Y_slice, Y_slice)*b + else: + b = beta + if uncertain_inputs: psi0 = kern.psi0(Z, X_slice) psi1 = kern.psi1(Z, X_slice) - psi2 = kern.psi2(Z, X_slice) + psi2_full += kern.psi2(Z, X_slice)*b else: psi0 = kern.Kdiag(X_slice) psi1 = kern.K(X_slice, Z) - psi2 = None - - if het_noise: - beta_slice = beta[n_start:n_end] - psi0_full += (beta_slice*psi0).sum() - psi1Y_full += np.dot(beta_slice*Y_slice.T,psi1) # DxM - YRY_full += (beta_slice*np.square(Y_slice).sum(axis=-1)).sum() - else: - psi0_full += psi0.sum() - psi1Y_full += np.dot(Y_slice.T,psi1) # DxM - - if uncertain_inputs: - if het_noise: - psi2_full += beta_slice*psi2 - else: - psi2_full += psi2 - else: - if het_noise: - psi2_full += beta_slice*np.outer(psi1,psi1) - else: - psi2_full += np.dot(psi1.T,psi1) + psi2_full += np.dot(psi1.T,psi1)*b + psi0_full += psi0.sum()*b + psi1Y_full += np.dot(Y_slice.T,psi1)*b # DxM + if not het_noise: - psi0_full *= beta - psi1Y_full *= beta - psi2_full *= beta YRY_full = trYYT*beta if self.mpi_comm != None: @@ -168,7 +153,7 @@ class VarDTC_minibatch(LatentFunctionInference): if het_noise: self.batchsize = 1 - psi0_full, psi1Y_full, psi2_full, YRY_full = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs, het_noise) + psi0_full, psi1Y_full, psi2_full, YRY_full = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs) #====================================================================== # Compute Common Components diff --git a/GPy/kern/_src/psi_comp/rbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/rbf_psi_gpucomp.py index 777d4749..270801ff 100644 --- a/GPy/kern/_src/psi_comp/rbf_psi_gpucomp.py +++ b/GPy/kern/_src/psi_comp/rbf_psi_gpucomp.py @@ -248,7 +248,6 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF): 'Z_gpu' :gpuarray.empty((M,Q),np.float64,order='F'), 'mu_gpu' :gpuarray.empty((N,Q),np.float64,order='F'), 'S_gpu' :gpuarray.empty((N,Q),np.float64,order='F'), - 'psi0_gpu' :gpuarray.empty((N,),np.float64,order='F'), 'psi1_gpu' :gpuarray.empty((N,M),np.float64,order='F'), 'psi2_gpu' :gpuarray.empty((M,M),np.float64,order='F'), 'psi2n_gpu' :gpuarray.empty((N,M,M),np.float64,order='F'), @@ -265,6 +264,10 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF): 'grad_mu_gpu' :gpuarray.empty((N,Q,),np.float64, order='F'), 'grad_S_gpu' :gpuarray.empty((N,Q,),np.float64, order='F'), } + else: + assert N==self.gpuCache['mu_gpu'].shape[0] + assert M==self.gpuCache['Z_gpu'].shape[0] + assert Q==self.gpuCache['l_gpu'].shape[0] def sync_params(self, lengthscale, Z, mu, S): if len(lengthscale)==1: @@ -299,7 +302,6 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF): self._initGPUCache(N,M,Q) self.sync_params(lengthscale, Z, variational_posterior.mean, variational_posterior.variance) - psi0_gpu = self.gpuCache['psi0_gpu'] psi1_gpu = self.gpuCache['psi1_gpu'] psi2_gpu = self.gpuCache['psi2_gpu'] psi2n_gpu = self.gpuCache['psi2n_gpu'] @@ -308,14 +310,15 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF): mu_gpu = self.gpuCache['mu_gpu'] S_gpu = self.gpuCache['S_gpu'] - psi0_gpu.fill(variance) + psi0 = np.empty((N,)) + psi0[:] = variance self.g_psi1computations(psi1_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1)) self.g_psi2computations(psi2_gpu, psi2n_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1)) if self.GPU_direct: - return psi0_gpu, psi1_gpu, psi2_gpu + return psi0, psi1_gpu, psi2_gpu else: - return psi0_gpu.get(), psi1_gpu.get(), psi2_gpu.get() + return psi0, psi1_gpu.get(), psi2_gpu.get() @Cache_this(limit=1, ignore_args=(0,1,2,3)) def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): @@ -340,17 +343,19 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF): if self.GPU_direct: dL_dpsi1_gpu = dL_dpsi1 dL_dpsi2_gpu = dL_dpsi2 + dL_dpsi0_sum = gpuarray.sum(dL_dpsi0).get() else: dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu'] dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu'] dL_dpsi1_gpu.set(np.asfortranarray(dL_dpsi1)) dL_dpsi2_gpu.set(np.asfortranarray(dL_dpsi2)) + dL_dpsi0_sum = dL_dpsi0.sum() self.reset_derivative() self.g_psi1compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi1_gpu,psi1_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1)) self.g_psi2compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi2_gpu,psi2n_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1)) - dL_dvar = np.sum(dL_dpsi0) + gpuarray.sum(dvar_gpu).get() + dL_dvar = dL_dpsi0_sum + gpuarray.sum(dvar_gpu).get() sum_axis(grad_mu_gpu,dmu_gpu,N*Q,self.blocknum) dL_dmu = grad_mu_gpu.get() sum_axis(grad_S_gpu,dS_gpu,N*Q,self.blocknum) diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py index b77768ae..65ab2469 100644 --- a/GPy/models/bayesian_gplvm.py +++ b/GPy/models/bayesian_gplvm.py @@ -62,6 +62,9 @@ class BayesianGPLVM(SparseGP): else: from ..inference.latent_function_inference.var_dtc import VarDTC inference_method = VarDTC() + + if kernel.useGPU and isinstance(inference_method, VarDTC_GPU): + kernel.psicomp.GPU_direct = True SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, name, **kwargs) self.add_parameter(self.X, index=0) diff --git a/GPy/util/linalg_gpu.py b/GPy/util/linalg_gpu.py index cf6d483e..1b9b0594 100644 --- a/GPy/util/linalg_gpu.py +++ b/GPy/util/linalg_gpu.py @@ -12,6 +12,9 @@ from ..util import gpu_init try: from pycuda.reduction import ReductionKernel from pycuda.elementwise import ElementwiseKernel + import scikits.cuda.linalg as culinalg + from scikits.cuda import cublas + from scikits.cuda.cula import culaExceptions # log|A| for A is a low triangle matrix # logDiagSum(A, A.shape[0]+1) @@ -60,3 +63,27 @@ try: except: pass + +def jitchol(A, L, cublas_handle, maxtries=5): + try: + cublas.cublasDcopy(cublas_handle, A.size, A.gpudata, 1, L.gpudata, 1) + culinalg.cho_factor(L,'L') + except culaExceptions: + + + diagA = np.diag(A) + if np.any(diagA <= 0.): + raise linalg.LinAlgError, "not pd: non-positive diagonal elements" + jitter = diagA.mean() * 1e-6 + while maxtries > 0 and np.isfinite(jitter): + print 'Warning: adding jitter of {:.10e}'.format(jitter) + try: + return linalg.cholesky(A + np.eye(A.shape[0]).T * jitter, lower=True) + except: + jitter *= 10 + finally: + maxtries -= 1 + raise linalg.LinAlgError, "not positive definite, even with jitter." + + +