From 2b1c1614d9b58ba1cc1b79ac1eb602c596d9a963 Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Mon, 24 Mar 2014 12:21:29 +0000 Subject: [PATCH 01/33] [GPU] var_dtc_gpu in progress --- .../latent_function_inference/var_dtc_gpu.py | 308 ++++++++++++++++++ 1 file changed, 308 insertions(+) create mode 100644 GPy/inference/latent_function_inference/var_dtc_gpu.py diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py new file mode 100644 index 00000000..d9d9293e --- /dev/null +++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py @@ -0,0 +1,308 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +from posterior import Posterior +from ...util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs +from ...util import diag +from ...core.parameterization.variational import VariationalPosterior +import numpy as np +from ...util.misc import param_to_array +log_2_pi = np.log(2*np.pi) + +try: + import scikits.cuda.linalg as culinalg + import pycuda.gpuarray as gpuarray + from scikits.cuda import cublas + import pycuda.autoinit +except: + print 'Error in importing GPU modules!' + +class VarDTC_GPU(object): + """ + An object for inference when the likelihood is Gaussian, but we want to do sparse inference. + + The function self.inference returns a Posterior object, which summarizes + the posterior. + + For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it. + + """ + const_jitter = np.float64(1e-6) + def __init__(self, batchsize, limit=1): + + self.batchsize = batchsize + + # Cache functions + from ...util.caching import Cacher + self.get_trYYT = Cacher(self._get_trYYT, limit) + self.get_YYTfactor = Cacher(self._get_YYTfactor, limit) + + self.midRes = {} + self.batch_pos = 0 # the starting position of the current mini-batch + + # Initialize GPU environment + culinalg.init() + self.cublas_handle = cublas.cublasCreate() + + def set_limit(self, limit): + self.get_trYYT.limit = limit + self.get_YYTfactor.limit = limit + + def _get_trYYT(self, Y): + return param_to_array(np.sum(np.square(Y))) + + def _get_YYTfactor(self, Y): + """ + find a matrix L which satisfies LLT = YYT. + + Note that L may have fewer columns than Y. 
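+
+        Only the product Y*Y.T enters the likelihood, so any L with
+        L*L.T = Y*Y.T can stand in for Y: when N >= D, Y itself is
+        returned; when N < D, an NxN Cholesky factor of Y*Y.T is
+        smaller and cheaper to carry around.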
+ """ + N, D = Y.shape + if (N>=D): + return param_to_array(Y) + else: + return jitchol(tdot(Y)) + + def inference_likelihood(self, kern, X, Z, likelihood, Y): + """ + The first phase of inference: + Compute: log-likelihood, dL_dKmm + + Cached intermediate results: Kmm, KmmInv, + """ + + num_inducing = Z.shape[0] + num_data, output_dim = Y.shape + + if isinstance(X, VariationalPosterior): + uncertain_inputs = True + else: + uncertain_inputs = False + + #see whether we've got a different noise variance for each datum + beta = 1./np.fmax(likelihood.variance, 1e-6) + het_noise = beta.size > 1 + trYYT = self.get_trYYT(Y) + + + psi2_full = np.zeros((num_inducing,num_inducing)) + psi1Y_full = np.zeros((output_dim,num_inducing)) # DxM + psi0_full = 0 + YRY_full = 0 + + for n_start in xrange(0,num_data,self.batchsize): + + n_end = min(self.batchsize+n_start, num_data) + + Y_slice = Y[n_start:n_end] + X_slice = X[n_start:n_end] + + if uncertain_inputs: + psi0 = kern.psi0(Z, X_slice) + psi1 = kern.psi1(Z, X_slice) + psi2 = kern.psi2(Z, X_slice) + else: + psi0 = kern.Kdiag(X_slice) + psi1 = kern.K(X_slice, Z) + psi2 = None + + if het_noise: + beta_slice = beta[n_start:n_end] + psi0_full += (beta_slice*psi0).sum() + psi1Y_full += np.dot(beta_slice*Y_slice.T,psi1) # DxM + YRY_full += (beta_slice*np.square(Y_slice).sum(axis=-1)).sum() + else: + psi0_full += psi0.sum() + psi1Y_full += np.dot(Y_slice.T,psi1) # DxM + + + if uncertain_inputs: + if het_noise: + psi2_full += np.einsum('n,nmo->mo',beta_slice,psi2) + else: + psi2_full += psi2.sum(axis=0) + else: + if het_noise: + psi2_full += np.einsum('n,nm,no->mo',beta_slice,psi1,psi1) + else: + psi2_full += tdot(psi1.T) + + if not het_noise: + psi0_full *= beta + psi1Y_full *= beta + psi2_full *= beta + YRY_full = trYYT*beta + + psi0_gpu = gpuarray.to_gpu(np.asfortranarray(psi0_full)) + psi1Y_gpu = gpuarray.to_gpu(np.asfortranarray(psi1Y_full)) + psi2_gpu = gpuarray.to_gpu(np.asfortranarray(psi2_full)) + YRY_gpu = gpuarray.to_gpu(np.asfortranarray(YRY_full)) + + #====================================================================== + # Compute Common Components + #====================================================================== + + Kmm = kern.K(Z).copy() + Kmm_gpu = gpuarray.to_gpu(np.asfortranarray(Kmm)) + + diag.add(Kmm, self.const_jitter) + ones_gpu = gpuarray.empty(num_inducing, np.float64) + cublas.cublasDaxpy(self.cublas_handle, num_inducing, self.const_jitter, ones_gpu.gpudata, 1, Kmm_gpu.gpudata, num_inducing+1) + assert np.allclose(Kmm, Kmm_gpu.get()) + + Lm = jitchol(Kmm) + Lm_gpu = Kmm_gpu.copy() + Lm_gpu = culinalg.cho_factor(Lm_gpu,'L') + assert np.allclose(Lm,Lm_gpu.get()) + + Lambda = Kmm+psi2_full + LL = jitchol(Lambda) + Lambda_gpu = gpuarray.empty((num_inducing,num_inducing),np.float64) + cublas.cublasDaxpy(self.cublas_handle, Kmm_gpu.size, np.float64(1.0), Kmm_gpu.gpudata, 1, psi2_gpu.gpudata, 1) + LL_gpu = Lambda_gpu.copy() + LL_gpu = culinalg.cho_factor(LL_gpu,'L') + assert np.allclose(LL,LL_gpu.get()) + + b,_ = dtrtrs(LL, psi1Y_full.T) + bbt = np.square(b).sum() + + + v,_ = dtrtrs(LL.T,b,lower=False) + vvt = np.einsum('md,od->mo',v,v) + LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right') + + Psi2LLInvT = dtrtrs(LL,psi2_full)[0].T + LmInvPsi2LLInvT= dtrtrs(Lm,Psi2LLInvT)[0] + KmmInvPsi2LLInvT = dtrtrs(Lm,LmInvPsi2LLInvT,trans=True)[0] + KmmInvPsi2P = dtrtrs(LL,KmmInvPsi2LLInvT.T, trans=True)[0].T + + dL_dpsi2R = (output_dim*KmmInvPsi2P - vvt)/2. 
# dL_dpsi2 with R inside psi2 + + # Cache intermediate results + self.midRes['dL_dpsi2R'] = dL_dpsi2R + self.midRes['v'] = v + + #====================================================================== + # Compute log-likelihood + #====================================================================== + if het_noise: + logL_R = -np.log(beta).sum() + else: + logL_R = -num_data*np.log(beta) + logL = -(output_dim*(num_data*log_2_pi+logL_R+psi0_full-np.trace(LmInvPsi2LmInvT))+YRY_full-bbt)/2.-output_dim*(-np.log(np.diag(Lm)).sum()+np.log(np.diag(LL)).sum()) + + #====================================================================== + # Compute dL_dKmm + #====================================================================== + + dL_dKmm = -(output_dim*np.einsum('md,od->mo',KmmInvPsi2LLInvT,KmmInvPsi2LLInvT) + vvt)/2. + + #====================================================================== + # Compute the Posterior distribution of inducing points p(u|Y) + #====================================================================== + + post = Posterior(woodbury_inv=KmmInvPsi2P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=Lm) + + return logL, dL_dKmm, post + + def inference_minibatch(self, kern, X, Z, likelihood, Y): + """ + The second phase of inference: Computing the derivatives over a minibatch of Y + Compute: dL_dpsi0, dL_dpsi1, dL_dpsi2, dL_dthetaL + return a flag showing whether it reached the end of Y (isEnd) + """ + + num_data, output_dim = Y.shape + + if isinstance(X, VariationalPosterior): + uncertain_inputs = True + else: + uncertain_inputs = False + + #see whether we've got a different noise variance for each datum + beta = 1./np.fmax(likelihood.variance, 1e-6) + het_noise = beta.size > 1 + # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency! + #self.YYTfactor = beta*self.get_YYTfactor(Y) + YYT_factor = Y + + n_start = self.batch_pos + n_end = min(self.batchsize+n_start, num_data) + if n_end==num_data: + isEnd = True + self.batch_pos = 0 + else: + isEnd = False + self.batch_pos = n_end + + num_slice = n_end-n_start + Y_slice = YYT_factor[n_start:n_end] + X_slice = X[n_start:n_end] + + if uncertain_inputs: + psi0 = kern.psi0(Z, X_slice) + psi1 = kern.psi1(Z, X_slice) + psi2 = kern.psi2(Z, X_slice) + else: + psi0 = kern.Kdiag(X_slice) + psi1 = kern.K(X_slice, Z) + psi2 = None + + if het_noise: + beta = beta[n_start:n_end] + + betaY = beta*Y_slice + betapsi1 = np.einsum('n,nm->nm',beta,psi1) + + #====================================================================== + # Load Intermediate Results + #====================================================================== + + dL_dpsi2R = self.midRes['dL_dpsi2R'] + v = self.midRes['v'] + + #====================================================================== + # Compute dL_dpsi + #====================================================================== + + dL_dpsi0 = -0.5 * output_dim * (beta * np.ones((n_end-n_start,))) + + dL_dpsi1 = np.dot(betaY,v.T) + + if uncertain_inputs: + dL_dpsi2 = np.einsum('n,mo->nmo',beta * np.ones((n_end-n_start,)),dL_dpsi2R) + else: + dL_dpsi1 += np.dot(betapsi1,dL_dpsi2R)*2. 
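+            # (the factor of 2 folds the psi2 gradient into dL_dpsi1: with
+            # deterministic inputs psi2 = psi1.T psi1, and for symmetric A,
+            # d tr(A psi1.T psi1) / d psi1 = 2 psi1 A)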
+ dL_dpsi2 = None + + #====================================================================== + # Compute dL_dthetaL + #====================================================================== + + if het_noise: + if uncertain_inputs: + psiR = np.einsum('mo,nmo->n',dL_dpsi2R,psi2) + else: + psiR = np.einsum('nm,no,mo->n',psi1,psi1,dL_dpsi2R) + + dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0)-output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum(axis=-1) + else: + if uncertain_inputs: + psiR = np.einsum('mo,nmo->',dL_dpsi2R,psi2) + else: + psiR = np.einsum('nm,no,mo->',psi1,psi1,dL_dpsi2R) + + dL_dthetaL = ((np.square(betaY)).sum() + np.square(beta)*output_dim*(psi0.sum())-num_slice*output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum() + + if uncertain_inputs: + grad_dict = {'dL_dpsi0':dL_dpsi0, + 'dL_dpsi1':dL_dpsi1, + 'dL_dpsi2':dL_dpsi2, + 'dL_dthetaL':dL_dthetaL} + else: + grad_dict = {'dL_dKdiag':dL_dpsi0, + 'dL_dKnm':dL_dpsi1, + 'dL_dthetaL':dL_dthetaL} + + return isEnd, (n_start,n_end), grad_dict + From 029abe8536c843fec0065a5165818c2311a55da4 Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Mon, 24 Mar 2014 16:19:30 +0000 Subject: [PATCH 02/33] [GPU] in progress --- .../latent_function_inference/var_dtc_gpu.py | 55 +++++++++++++++---- 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py index d9d9293e..36475fbb 100644 --- a/GPy/inference/latent_function_inference/var_dtc_gpu.py +++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py @@ -86,7 +86,7 @@ class VarDTC_GPU(object): psi2_full = np.zeros((num_inducing,num_inducing)) - psi1Y_full = np.zeros((output_dim,num_inducing)) # DxM + psi1Y_full = np.zeros((num_inducing,output_dim)) # DxM psi0_full = 0 YRY_full = 0 @@ -109,11 +109,11 @@ class VarDTC_GPU(object): if het_noise: beta_slice = beta[n_start:n_end] psi0_full += (beta_slice*psi0).sum() - psi1Y_full += np.dot(beta_slice*Y_slice.T,psi1) # DxM + psi1Y_full += np.dot(psi1,beta_slice[:,None]*Y_slice) # DxM YRY_full += (beta_slice*np.square(Y_slice).sum(axis=-1)).sum() else: psi0_full += psi0.sum() - psi1Y_full += np.dot(Y_slice.T,psi1) # DxM + psi1Y_full += np.dot(psi1,Y_slice) # DxM if uncertain_inputs: @@ -144,37 +144,68 @@ class VarDTC_GPU(object): Kmm = kern.K(Z).copy() Kmm_gpu = gpuarray.to_gpu(np.asfortranarray(Kmm)) - diag.add(Kmm, self.const_jitter) ones_gpu = gpuarray.empty(num_inducing, np.float64) + ones_gpu.fill(1.0) cublas.cublasDaxpy(self.cublas_handle, num_inducing, self.const_jitter, ones_gpu.gpudata, 1, Kmm_gpu.gpudata, num_inducing+1) assert np.allclose(Kmm, Kmm_gpu.get()) Lm = jitchol(Kmm) + # Lm_gpu = Kmm_gpu.copy() - Lm_gpu = culinalg.cho_factor(Lm_gpu,'L') - assert np.allclose(Lm,Lm_gpu.get()) + culinalg.cho_factor(Lm_gpu,'L') + print np.abs(np.tril(Lm)-np.tril(Lm_gpu.get())).max() Lambda = Kmm+psi2_full LL = jitchol(Lambda) + # Lambda_gpu = gpuarray.empty((num_inducing,num_inducing),np.float64) - cublas.cublasDaxpy(self.cublas_handle, Kmm_gpu.size, np.float64(1.0), Kmm_gpu.gpudata, 1, psi2_gpu.gpudata, 1) + cublas.cublasDcopy(self.cublas_handle, Kmm_gpu.size, Kmm_gpu.gpudata, 1, Lambda_gpu.gpudata, 1) + cublas.cublasDaxpy(self.cublas_handle, psi2_gpu.size, np.float64(1.0), psi2_gpu.gpudata, 1, Lambda_gpu.gpudata, 1) LL_gpu = Lambda_gpu.copy() - LL_gpu = culinalg.cho_factor(LL_gpu,'L') - assert np.allclose(LL,LL_gpu.get()) - - b,_ = dtrtrs(LL, psi1Y_full.T) - 
bbt = np.square(b).sum() + culinalg.cho_factor(LL_gpu,'L') + print np.abs(np.tril(LL)-np.tril(LL_gpu.get())).max() + b,_ = dtrtrs(LL, psi1Y_full) + bbt_cpu = np.square(b).sum() + # + b_gpu = gpuarray.empty((num_inducing,output_dim),np.float64) + cublas.cublasDcopy(self.cublas_handle, b_gpu.size, psi1Y_gpu.gpudata, 1, b_gpu.gpudata, 1) + cublas.cublasDtrsm(self.cublas_handle , 'L', 'L', 'N', 'N', num_inducing, output_dim, np.float64(1.0), LL_gpu.gpudata, num_inducing, b_gpu.gpudata, num_inducing) + bbt = cublas.cublasDdot(self.cublas_handle, b_gpu.size, b_gpu, 1, b_gpu, 1) + print np.abs(bbt-bbt_cpu) v,_ = dtrtrs(LL.T,b,lower=False) vvt = np.einsum('md,od->mo',v,v) LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right') + # + v_gpu = gpuarray.empty((num_inducing,output_dim),np.float64) + cublas.cublasDcopy(self.cublas_handle, v_gpu.size, b_gpu.gpudata, 1, v_gpu.gpudata, 1) + cublas.cublasDtrsm(self.cublas_handle , 'L', 'L', 'T', 'N', num_inducing, output_dim, np.float64(1.0), LL_gpu.gpudata, num_inducing, v_gpu.gpudata, num_inducing) + vvt_gpu = gpuarray.empty((num_inducing,num_inducing),np.float64) + cublas.cublasDgemm(self.cublas_handle, 'N', 'T', num_inducing, num_inducing, output_dim, np.float64(1.0), v_gpu.gpudata, num_inducing, v_gpu.gpudata, num_inducing, np.float64(0.), vvt_gpu.gpudata, num_inducing) + LmInvPsi2LmInvT_gpu = gpuarray.empty((num_inducing,num_inducing),np.float64) + cublas.cublasDcopy(self.cublas_handle, psi2_gpu.size, psi2_gpu.gpudata, 1, LmInvPsi2LmInvT_gpu.gpudata, 1) + cublas.cublasDtrsm(self.cublas_handle , 'L', 'L', 'N', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, LmInvPsi2LmInvT_gpu.gpudata, num_inducing) + cublas.cublasDtrsm(self.cublas_handle , 'R', 'L', 'T', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, LmInvPsi2LmInvT_gpu.gpudata, num_inducing) + tr_LmInvPsi2LmInvT = cublas.cublasDasum(self.cublas_handle, num_inducing, LmInvPsi2LmInvT_gpu.gpudata, num_inducing+1) + print np.abs(vvt-vvt_gpu.get()).max() + print np.abs(np.trace(LmInvPsi2LmInvT)-tr_LmInvPsi2LmInvT) Psi2LLInvT = dtrtrs(LL,psi2_full)[0].T LmInvPsi2LLInvT= dtrtrs(Lm,Psi2LLInvT)[0] KmmInvPsi2LLInvT = dtrtrs(Lm,LmInvPsi2LLInvT,trans=True)[0] KmmInvPsi2P = dtrtrs(LL,KmmInvPsi2LLInvT.T, trans=True)[0].T + # + KmmInvPsi2LLInvT_gpu = LmInvPsi2LmInvT_gpu # Reuse GPU memory (size:MxM) + cublas.cublasDcopy(self.cublas_handle, psi2_gpu.size, psi2_gpu.gpudata, 1, KmmInvPsi2LLInvT_gpu.gpudata, 1) + cublas.cublasDtrsm(self.cublas_handle , 'L', 'L', 'N', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, KmmInvPsi2LLInvT_gpu.gpudata, num_inducing) + cublas.cublasDtrsm(self.cublas_handle , 'R', 'L', 'T', 'N', num_inducing, num_inducing, np.float64(1.0), LL_gpu.gpudata, num_inducing, KmmInvPsi2LLInvT_gpu.gpudata, num_inducing) + cublas.cublasDtrsm(self.cublas_handle , 'L', 'L', 'T', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, KmmInvPsi2LLInvT_gpu.gpudata, num_inducing) + KmmInvPsi2P_gpu = gpuarray.empty((num_inducing,num_inducing),np.float64) + cublas.cublasDcopy(self.cublas_handle, KmmInvPsi2LLInvT_gpu.size, KmmInvPsi2LLInvT_gpu.gpudata, 1, KmmInvPsi2P_gpu.gpudata, 1) + cublas.cublasDtrsm(self.cublas_handle , 'R', 'L', 'N', 'N', num_inducing, num_inducing, np.float64(1.0), LL_gpu.gpudata, num_inducing, KmmInvPsi2P_gpu.gpudata, num_inducing) + print np.abs(KmmInvPsi2P-KmmInvPsi2P_gpu.get()).max() dL_dpsi2R = (output_dim*KmmInvPsi2P - vvt)/2. 
# dL_dpsi2 with R inside psi2 From 88277f6b67392a87d6d84e50dd8784da07d508f2 Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Mon, 24 Mar 2014 17:17:06 +0000 Subject: [PATCH 03/33] [GPU] inference function part1 --- .../latent_function_inference/__init__.py | 1 + .../latent_function_inference/var_dtc_gpu.py | 38 ++++++++++++++----- GPy/models/bayesian_gplvm.py | 3 ++ 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/GPy/inference/latent_function_inference/__init__.py b/GPy/inference/latent_function_inference/__init__.py index ee459a76..effa077c 100644 --- a/GPy/inference/latent_function_inference/__init__.py +++ b/GPy/inference/latent_function_inference/__init__.py @@ -32,6 +32,7 @@ from expectation_propagation import EP from dtc import DTC from fitc import FITC from var_dtc_parallel import VarDTC_minibatch +from var_dtc_gpu import VarDTC_GPU # class FullLatentFunctionData(object): # diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py index 36475fbb..b4ed2e44 100644 --- a/GPy/inference/latent_function_inference/var_dtc_gpu.py +++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py @@ -14,6 +14,7 @@ try: import pycuda.gpuarray as gpuarray from scikits.cuda import cublas import pycuda.autoinit + from pycuda.reduction import ReductionKernel except: print 'Error in importing GPU modules!' @@ -133,10 +134,8 @@ class VarDTC_GPU(object): psi2_full *= beta YRY_full = trYYT*beta - psi0_gpu = gpuarray.to_gpu(np.asfortranarray(psi0_full)) psi1Y_gpu = gpuarray.to_gpu(np.asfortranarray(psi1Y_full)) psi2_gpu = gpuarray.to_gpu(np.asfortranarray(psi2_full)) - YRY_gpu = gpuarray.to_gpu(np.asfortranarray(YRY_full)) #====================================================================== # Compute Common Components @@ -172,7 +171,7 @@ class VarDTC_GPU(object): b_gpu = gpuarray.empty((num_inducing,output_dim),np.float64) cublas.cublasDcopy(self.cublas_handle, b_gpu.size, psi1Y_gpu.gpudata, 1, b_gpu.gpudata, 1) cublas.cublasDtrsm(self.cublas_handle , 'L', 'L', 'N', 'N', num_inducing, output_dim, np.float64(1.0), LL_gpu.gpudata, num_inducing, b_gpu.gpudata, num_inducing) - bbt = cublas.cublasDdot(self.cublas_handle, b_gpu.size, b_gpu, 1, b_gpu, 1) + bbt = cublas.cublasDdot(self.cublas_handle, b_gpu.size, b_gpu.gpudata, 1, b_gpu.gpudata, 1) print np.abs(bbt-bbt_cpu) v,_ = dtrtrs(LL.T,b,lower=False) @@ -187,7 +186,7 @@ class VarDTC_GPU(object): LmInvPsi2LmInvT_gpu = gpuarray.empty((num_inducing,num_inducing),np.float64) cublas.cublasDcopy(self.cublas_handle, psi2_gpu.size, psi2_gpu.gpudata, 1, LmInvPsi2LmInvT_gpu.gpudata, 1) cublas.cublasDtrsm(self.cublas_handle , 'L', 'L', 'N', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, LmInvPsi2LmInvT_gpu.gpudata, num_inducing) - cublas.cublasDtrsm(self.cublas_handle , 'R', 'L', 'T', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, LmInvPsi2LmInvT_gpu.gpudata, num_inducing) + cublas.cublasDtrsm(self.cublas_handle , 'r', 'L', 'T', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, LmInvPsi2LmInvT_gpu.gpudata, num_inducing) tr_LmInvPsi2LmInvT = cublas.cublasDasum(self.cublas_handle, num_inducing, LmInvPsi2LmInvT_gpu.gpudata, num_inducing+1) print np.abs(vvt-vvt_gpu.get()).max() print np.abs(np.trace(LmInvPsi2LmInvT)-tr_LmInvPsi2LmInvT) @@ -200,18 +199,26 @@ class VarDTC_GPU(object): KmmInvPsi2LLInvT_gpu = LmInvPsi2LmInvT_gpu # Reuse GPU memory (size:MxM) cublas.cublasDcopy(self.cublas_handle, 
psi2_gpu.size, psi2_gpu.gpudata, 1, KmmInvPsi2LLInvT_gpu.gpudata, 1) cublas.cublasDtrsm(self.cublas_handle , 'L', 'L', 'N', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, KmmInvPsi2LLInvT_gpu.gpudata, num_inducing) - cublas.cublasDtrsm(self.cublas_handle , 'R', 'L', 'T', 'N', num_inducing, num_inducing, np.float64(1.0), LL_gpu.gpudata, num_inducing, KmmInvPsi2LLInvT_gpu.gpudata, num_inducing) + cublas.cublasDtrsm(self.cublas_handle , 'r', 'L', 'T', 'N', num_inducing, num_inducing, np.float64(1.0), LL_gpu.gpudata, num_inducing, KmmInvPsi2LLInvT_gpu.gpudata, num_inducing) cublas.cublasDtrsm(self.cublas_handle , 'L', 'L', 'T', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, KmmInvPsi2LLInvT_gpu.gpudata, num_inducing) KmmInvPsi2P_gpu = gpuarray.empty((num_inducing,num_inducing),np.float64) cublas.cublasDcopy(self.cublas_handle, KmmInvPsi2LLInvT_gpu.size, KmmInvPsi2LLInvT_gpu.gpudata, 1, KmmInvPsi2P_gpu.gpudata, 1) - cublas.cublasDtrsm(self.cublas_handle , 'R', 'L', 'N', 'N', num_inducing, num_inducing, np.float64(1.0), LL_gpu.gpudata, num_inducing, KmmInvPsi2P_gpu.gpudata, num_inducing) + cublas.cublasDtrsm(self.cublas_handle , 'r', 'L', 'N', 'N', num_inducing, num_inducing, np.float64(1.0), LL_gpu.gpudata, num_inducing, KmmInvPsi2P_gpu.gpudata, num_inducing) print np.abs(KmmInvPsi2P-KmmInvPsi2P_gpu.get()).max() dL_dpsi2R = (output_dim*KmmInvPsi2P - vvt)/2. # dL_dpsi2 with R inside psi2 + dL_dpsi2R_gpu = gpuarray.empty((num_inducing,num_inducing),np.float64) + cublas.cublasDcopy(self.cublas_handle, vvt_gpu.size, vvt_gpu.gpudata, 1, dL_dpsi2R_gpu.gpudata, 1) + cublas.cublasDaxpy(self.cublas_handle, KmmInvPsi2P_gpu.size, np.float64(-output_dim), KmmInvPsi2P_gpu.gpudata, 1, dL_dpsi2R_gpu.gpudata, 1) + cublas.cublasDscal(self.cublas_handle, dL_dpsi2R_gpu.size, np.float64(-0.5), dL_dpsi2R_gpu.gpudata, 1) + print np.abs(dL_dpsi2R_gpu.get()-dL_dpsi2R).max() + # Cache intermediate results - self.midRes['dL_dpsi2R'] = dL_dpsi2R - self.midRes['v'] = v + self.midRes['dL_dpsi2R'] = dL_dpsi2R_gpu + self.midRes['v'] = v_gpu + + logDiagSum = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="i%step==0?log(x[i]):0", arguments="double *x, int step") #====================================================================== # Compute log-likelihood @@ -220,19 +227,30 @@ class VarDTC_GPU(object): logL_R = -np.log(beta).sum() else: logL_R = -num_data*np.log(beta) - logL = -(output_dim*(num_data*log_2_pi+logL_R+psi0_full-np.trace(LmInvPsi2LmInvT))+YRY_full-bbt)/2.-output_dim*(-np.log(np.diag(Lm)).sum()+np.log(np.diag(LL)).sum()) + logL_old = -(output_dim*(num_data*log_2_pi+logL_R+psi0_full-np.trace(LmInvPsi2LmInvT))+YRY_full-bbt)/2.-output_dim*(-np.log(np.diag(Lm)).sum()+np.log(np.diag(LL)).sum()) + + logdetKmm = logDiagSum(Lm_gpu,num_inducing+1) + logdetLambda = logDiagSum(LL_gpu,num_inducing+1) + logL = -(output_dim*(num_data*log_2_pi+logL_R+psi0_full-tr_LmInvPsi2LmInvT)+YRY_full-bbt)/2.+output_dim*(logdetKmm-logdetLambda) + print np.abs(logL_old - logL) #====================================================================== # Compute dL_dKmm #====================================================================== dL_dKmm = -(output_dim*np.einsum('md,od->mo',KmmInvPsi2LLInvT,KmmInvPsi2LLInvT) + vvt)/2. 
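+        # GPU sketch of the line above, with A = KmmInvPsi2LLInvT (MxM):
+        #   Dgemm:  dL_dKmm_gpu  = A.dot(A.T)
+        #   Daxpy:  dL_dKmm_gpu += vvt/output_dim
+        #   Dscal:  dL_dKmm_gpu *= -output_dim/2.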
+ # + dL_dKmm_gpu = gpuarray.empty((num_inducing,num_inducing),np.float64) + cublas.cublasDgemm(self.cublas_handle, 'N', 'T', num_inducing, num_inducing, output_dim, np.float64(1.0), KmmInvPsi2LLInvT_gpu.gpudata, num_inducing, KmmInvPsi2LLInvT_gpu.gpudata, num_inducing, np.float64(0.), dL_dKmm_gpu.gpudata, num_inducing) + cublas.cublasDaxpy(self.cublas_handle, dL_dKmm_gpu.size, np.float64(1./output_dim), vvt_gpu.gpudata, 1, dL_dKmm_gpu.gpudata, 1) + cublas.cublasDscal(self.cublas_handle, dL_dKmm_gpu.size, np.float64(-output_dim/2.), dL_dpsi2R_gpu.gpudata, 1) + print np.abs(dL_dKmm - dL_dKmm_gpu.get()).max() #====================================================================== # Compute the Posterior distribution of inducing points p(u|Y) #====================================================================== - post = Posterior(woodbury_inv=KmmInvPsi2P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=Lm) + post = Posterior(woodbury_inv=KmmInvPsi2P_gpu.get(), woodbury_vector=v_gpu.get(), K=Kmm_gpu.get(), mean=None, cov=None, K_chol=Lm.get()) return logL, dL_dKmm, post diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py index fb821d64..95230f54 100644 --- a/GPy/models/bayesian_gplvm.py +++ b/GPy/models/bayesian_gplvm.py @@ -67,6 +67,9 @@ class BayesianGPLVM(SparseGP): X.mean.gradient, X.variance.gradient = X_grad def parameters_changed(self): + update_gradients(self) + return + super(BayesianGPLVM, self).parameters_changed() self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X) From b5b17b9715286775d69fa1e7058d544d3eed536c Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Mon, 24 Mar 2014 18:23:11 +0000 Subject: [PATCH 04/33] [GPU] finish infere_likelihood --- .../latent_function_inference/var_dtc_gpu.py | 68 +++++++++++++------ 1 file changed, 48 insertions(+), 20 deletions(-) diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py index b4ed2e44..669d8b97 100644 --- a/GPy/inference/latent_function_inference/var_dtc_gpu.py +++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py @@ -15,6 +15,7 @@ try: from scikits.cuda import cublas import pycuda.autoinit from pycuda.reduction import ReductionKernel + from ...util.linalg_gpu import logDiagSum except: print 'Error in importing GPU modules!' 
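+# logDiagSum(L, n+1) reduces sum(log(diag(L))) for an nxn factor on the GPU,
+# i.e. half the log-determinant of L*L.T; the stride n+1 picks out the diagonal.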
@@ -44,6 +45,27 @@ class VarDTC_GPU(object): # Initialize GPU environment culinalg.init() self.cublas_handle = cublas.cublasCreate() + + # Initialize GPU caches + self.gpuCache = None + + def _initGPUCache(self, num_inducing, output_dim): + if self.gpuCache == None: + self.gpuCache = {# inference_likelihood + 'Kmm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64), + 'Lm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64), + 'ones_gpu' :gpuarray.empty(num_inducing, np.float64), + 'LL_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64), + 'b_gpu' :gpuarray.empty((num_inducing,output_dim),np.float64), + 'v_gpu' :gpuarray.empty((num_inducing,output_dim),np.float64), + 'vvt_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64), + 'KmmInvPsi2LLInvT_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64), + 'KmmInvPsi2P_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64), + 'dL_dpsi2R_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64), + 'dL_dKmm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64), + # inference_minibatch + } + self.gpuCache['ones_gpu'].fill(1.0) def set_limit(self, limit): self.get_trYYT.limit = limit @@ -74,6 +96,8 @@ class VarDTC_GPU(object): num_inducing = Z.shape[0] num_data, output_dim = Y.shape + + self._initGPUCache(num_inducing, output_dim) if isinstance(X, VariationalPosterior): uncertain_inputs = True @@ -142,33 +166,34 @@ class VarDTC_GPU(object): #====================================================================== Kmm = kern.K(Z).copy() - Kmm_gpu = gpuarray.to_gpu(np.asfortranarray(Kmm)) + Kmm_gpu = self.gpuCache['Kmm_gpu'] + Kmm_gpu.set(Kmm) diag.add(Kmm, self.const_jitter) - ones_gpu = gpuarray.empty(num_inducing, np.float64) - ones_gpu.fill(1.0) + ones_gpu = self.gpuCache['ones_gpu'] cublas.cublasDaxpy(self.cublas_handle, num_inducing, self.const_jitter, ones_gpu.gpudata, 1, Kmm_gpu.gpudata, num_inducing+1) assert np.allclose(Kmm, Kmm_gpu.get()) Lm = jitchol(Kmm) # - Lm_gpu = Kmm_gpu.copy() + Lm_gpu = self.gpuCache['Lm_gpu'] + cublas.cublasDcopy(self.cublas_handle, Kmm_gpu.size, Kmm_gpu.gpudata, 1, Lm_gpu.gpudata, 1) culinalg.cho_factor(Lm_gpu,'L') print np.abs(np.tril(Lm)-np.tril(Lm_gpu.get())).max() Lambda = Kmm+psi2_full LL = jitchol(Lambda) # - Lambda_gpu = gpuarray.empty((num_inducing,num_inducing),np.float64) + Lambda_gpu = self.gpuCache['LL_gpu'] cublas.cublasDcopy(self.cublas_handle, Kmm_gpu.size, Kmm_gpu.gpudata, 1, Lambda_gpu.gpudata, 1) cublas.cublasDaxpy(self.cublas_handle, psi2_gpu.size, np.float64(1.0), psi2_gpu.gpudata, 1, Lambda_gpu.gpudata, 1) - LL_gpu = Lambda_gpu.copy() + LL_gpu = Lambda_gpu culinalg.cho_factor(LL_gpu,'L') print np.abs(np.tril(LL)-np.tril(LL_gpu.get())).max() b,_ = dtrtrs(LL, psi1Y_full) bbt_cpu = np.square(b).sum() # - b_gpu = gpuarray.empty((num_inducing,output_dim),np.float64) + b_gpu = self.gpuCache['b_gpu'] cublas.cublasDcopy(self.cublas_handle, b_gpu.size, psi1Y_gpu.gpudata, 1, b_gpu.gpudata, 1) cublas.cublasDtrsm(self.cublas_handle , 'L', 'L', 'N', 'N', num_inducing, output_dim, np.float64(1.0), LL_gpu.gpudata, num_inducing, b_gpu.gpudata, num_inducing) bbt = cublas.cublasDdot(self.cublas_handle, b_gpu.size, b_gpu.gpudata, 1, b_gpu.gpudata, 1) @@ -178,12 +203,12 @@ class VarDTC_GPU(object): vvt = np.einsum('md,od->mo',v,v) LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right') # - v_gpu = gpuarray.empty((num_inducing,output_dim),np.float64) + v_gpu = self.gpuCache['v_gpu'] cublas.cublasDcopy(self.cublas_handle, v_gpu.size, b_gpu.gpudata, 
1, v_gpu.gpudata, 1) cublas.cublasDtrsm(self.cublas_handle , 'L', 'L', 'T', 'N', num_inducing, output_dim, np.float64(1.0), LL_gpu.gpudata, num_inducing, v_gpu.gpudata, num_inducing) - vvt_gpu = gpuarray.empty((num_inducing,num_inducing),np.float64) + vvt_gpu = self.gpuCache['vvt_gpu'] cublas.cublasDgemm(self.cublas_handle, 'N', 'T', num_inducing, num_inducing, output_dim, np.float64(1.0), v_gpu.gpudata, num_inducing, v_gpu.gpudata, num_inducing, np.float64(0.), vvt_gpu.gpudata, num_inducing) - LmInvPsi2LmInvT_gpu = gpuarray.empty((num_inducing,num_inducing),np.float64) + LmInvPsi2LmInvT_gpu = self.gpuCache['KmmInvPsi2LLInvT_gpu'] cublas.cublasDcopy(self.cublas_handle, psi2_gpu.size, psi2_gpu.gpudata, 1, LmInvPsi2LmInvT_gpu.gpudata, 1) cublas.cublasDtrsm(self.cublas_handle , 'L', 'L', 'N', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, LmInvPsi2LmInvT_gpu.gpudata, num_inducing) cublas.cublasDtrsm(self.cublas_handle , 'r', 'L', 'T', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, LmInvPsi2LmInvT_gpu.gpudata, num_inducing) @@ -201,24 +226,24 @@ class VarDTC_GPU(object): cublas.cublasDtrsm(self.cublas_handle , 'L', 'L', 'N', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, KmmInvPsi2LLInvT_gpu.gpudata, num_inducing) cublas.cublasDtrsm(self.cublas_handle , 'r', 'L', 'T', 'N', num_inducing, num_inducing, np.float64(1.0), LL_gpu.gpudata, num_inducing, KmmInvPsi2LLInvT_gpu.gpudata, num_inducing) cublas.cublasDtrsm(self.cublas_handle , 'L', 'L', 'T', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, KmmInvPsi2LLInvT_gpu.gpudata, num_inducing) - KmmInvPsi2P_gpu = gpuarray.empty((num_inducing,num_inducing),np.float64) + KmmInvPsi2P_gpu = self.gpuCache['KmmInvPsi2P_gpu'] cublas.cublasDcopy(self.cublas_handle, KmmInvPsi2LLInvT_gpu.size, KmmInvPsi2LLInvT_gpu.gpudata, 1, KmmInvPsi2P_gpu.gpudata, 1) cublas.cublasDtrsm(self.cublas_handle , 'r', 'L', 'N', 'N', num_inducing, num_inducing, np.float64(1.0), LL_gpu.gpudata, num_inducing, KmmInvPsi2P_gpu.gpudata, num_inducing) print np.abs(KmmInvPsi2P-KmmInvPsi2P_gpu.get()).max() dL_dpsi2R = (output_dim*KmmInvPsi2P - vvt)/2. # dL_dpsi2 with R inside psi2 - dL_dpsi2R_gpu = gpuarray.empty((num_inducing,num_inducing),np.float64) + # + dL_dpsi2R_gpu = self.gpuCache['dL_dpsi2R_gpu'] cublas.cublasDcopy(self.cublas_handle, vvt_gpu.size, vvt_gpu.gpudata, 1, dL_dpsi2R_gpu.gpudata, 1) cublas.cublasDaxpy(self.cublas_handle, KmmInvPsi2P_gpu.size, np.float64(-output_dim), KmmInvPsi2P_gpu.gpudata, 1, dL_dpsi2R_gpu.gpudata, 1) cublas.cublasDscal(self.cublas_handle, dL_dpsi2R_gpu.size, np.float64(-0.5), dL_dpsi2R_gpu.gpudata, 1) print np.abs(dL_dpsi2R_gpu.get()-dL_dpsi2R).max() - - + # Cache intermediate results - self.midRes['dL_dpsi2R'] = dL_dpsi2R_gpu - self.midRes['v'] = v_gpu + self.midRes['dL_dpsi2R'] = dL_dpsi2R + self.midRes['v'] = v - logDiagSum = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="i%step==0?log(x[i]):0", arguments="double *x, int step") + #logDiagSum = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="i%step==0?log(x[i]):0", arguments="double *x, int step") #====================================================================== # Compute log-likelihood @@ -240,10 +265,10 @@ class VarDTC_GPU(object): dL_dKmm = -(output_dim*np.einsum('md,od->mo',KmmInvPsi2LLInvT,KmmInvPsi2LLInvT) + vvt)/2. 
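+        # note: Dgemm's inner dimension k must be num_inducing here, since
+        # KmmInvPsi2LLInvT is MxM, and Dscal has to rescale dL_dKmm_gpu itself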
# - dL_dKmm_gpu = gpuarray.empty((num_inducing,num_inducing),np.float64) - cublas.cublasDgemm(self.cublas_handle, 'N', 'T', num_inducing, num_inducing, output_dim, np.float64(1.0), KmmInvPsi2LLInvT_gpu.gpudata, num_inducing, KmmInvPsi2LLInvT_gpu.gpudata, num_inducing, np.float64(0.), dL_dKmm_gpu.gpudata, num_inducing) + dL_dKmm_gpu = self.gpuCache['dL_dKmm_gpu'] + cublas.cublasDgemm(self.cublas_handle, 'N', 'T', num_inducing, num_inducing, num_inducing, np.float64(1.0), KmmInvPsi2LLInvT_gpu.gpudata, num_inducing, KmmInvPsi2LLInvT_gpu.gpudata, num_inducing, np.float64(0.), dL_dKmm_gpu.gpudata, num_inducing) cublas.cublasDaxpy(self.cublas_handle, dL_dKmm_gpu.size, np.float64(1./output_dim), vvt_gpu.gpudata, 1, dL_dKmm_gpu.gpudata, 1) - cublas.cublasDscal(self.cublas_handle, dL_dKmm_gpu.size, np.float64(-output_dim/2.), dL_dpsi2R_gpu.gpudata, 1) + cublas.cublasDscal(self.cublas_handle, dL_dKmm_gpu.size, np.float64(-output_dim/2.), dL_dKmm_gpu.gpudata, 1) print np.abs(dL_dKmm - dL_dKmm_gpu.get()).max() #====================================================================== @@ -303,6 +328,9 @@ class VarDTC_GPU(object): betaY = beta*Y_slice betapsi1 = np.einsum('n,nm->nm',beta,psi1) + betaY_gpu = gpuarray.to_gpu(betaY) + betapsi1_gpu = gpuarray.to_gpu(betapsi1) + #====================================================================== # Load Intermediate Results #====================================================================== From 53627ee2826ecb415ca268ac0cb5e7ae853a9c18 Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Wed, 26 Mar 2014 10:47:33 +0000 Subject: [PATCH 05/33] [GPU] GPU kernel --- GPy/core/parameterization/variational.py | 7 ++- .../latent_function_inference/var_dtc_gpu.py | 2 +- .../var_dtc_parallel.py | 50 ++++++++++++++++ GPy/kern/_src/rbf.py | 4 ++ GPy/models/bayesian_gplvm.py | 57 ++----------------- GPy/models/ss_gplvm.py | 12 +++- GPy/plotting/matplot_dep/variational_plots.py | 18 ++++-- 7 files changed, 90 insertions(+), 60 deletions(-) diff --git a/GPy/core/parameterization/variational.py b/GPy/core/parameterization/variational.py index ce39e2c9..ac1dfc63 100644 --- a/GPy/core/parameterization/variational.py +++ b/GPy/core/parameterization/variational.py @@ -40,6 +40,7 @@ class SpikeAndSlabPrior(VariationalPrior): self.pi = Param('pi', pi, Logistic(1e-10,1.-1e-10)) self.variance = Param('variance',variance) self.add_parameters(self.pi) + self.group_spike_prob = False def KL_divergence(self, variational_posterior): mu = variational_posterior.mean @@ -55,7 +56,11 @@ class SpikeAndSlabPrior(VariationalPrior): S = variational_posterior.variance gamma = variational_posterior.binary_prob - gamma.gradient -= np.log((1-self.pi)/self.pi*gamma/(1.-gamma))+(np.square(mu)+S-np.log(S)-1.)/2. + if self.group_spike_prob: + gamma_grad = np.log((1-self.pi)/self.pi*gamma/(1.-gamma))+(np.square(mu)+S-np.log(S)-1.)/2. + gamma.gradient -= gamma_grad.mean(axis=0) + else: + gamma.gradient -= np.log((1-self.pi)/self.pi*gamma/(1.-gamma))+(np.square(mu)+S-np.log(S)-1.)/2. mu.gradient -= gamma*mu S.gradient -= (1. - (1. / (S))) * gamma /2. 
self.pi.gradient = (gamma/self.pi - (1.-gamma)/(1.-self.pi)).sum(axis=0) diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py index 669d8b97..ba7ec602 100644 --- a/GPy/inference/latent_function_inference/var_dtc_gpu.py +++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py @@ -17,7 +17,7 @@ try: from pycuda.reduction import ReductionKernel from ...util.linalg_gpu import logDiagSum except: - print 'Error in importing GPU modules!' + pass class VarDTC_GPU(object): """ diff --git a/GPy/inference/latent_function_inference/var_dtc_parallel.py b/GPy/inference/latent_function_inference/var_dtc_parallel.py index bb69b88d..4b29b16a 100644 --- a/GPy/inference/latent_function_inference/var_dtc_parallel.py +++ b/GPy/inference/latent_function_inference/var_dtc_parallel.py @@ -279,4 +279,54 @@ class VarDTC_minibatch(object): 'dL_dthetaL':dL_dthetaL} return isEnd, (n_start,n_end), grad_dict + + +def update_gradients(model): + model._log_marginal_likelihood, dL_dKmm, model.posterior = model.inference_method.inference_likelihood(model.kern, model.X, model.Z, model.likelihood, model.Y) + het_noise = model.likelihood.variance.size > 1 + + if het_noise: + dL_dthetaL = np.empty((model.Y.shape[0],)) + else: + dL_dthetaL = 0 + + #gradients w.r.t. kernel + model.kern.update_gradients_full(dL_dKmm, model.Z, None) + kern_grad = model.kern.gradient.copy() + + #gradients w.r.t. Z + model.Z.gradient[:,model.kern.active_dims] = model.kern.gradients_X(dL_dKmm, model.Z) + + isEnd = False + while not isEnd: + isEnd, n_range, grad_dict = model.inference_method.inference_minibatch(model.kern, model.X, model.Z, model.likelihood, model.Y) + if isinstance(model.X, VariationalPosterior): + + #gradients w.r.t. kernel + model.kern.update_gradients_expectations(variational_posterior=model.X[n_range[0]:n_range[1]], Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2']) + kern_grad += model.kern.gradient + + #gradients w.r.t. Z + model.Z.gradient[:,model.kern.active_dims] += model.kern.gradients_Z_expectations( + grad_dict['dL_dpsi1'], grad_dict['dL_dpsi2'], Z=model.Z, variational_posterior=model.X[n_range[0]:n_range[1]]) + + #gradients w.r.t. posterior parameters of X + X_grad = model.kern.gradients_qX_expectations(variational_posterior=model.X[n_range[0]:n_range[1]], Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2']) + model.set_X_gradients(model.X[n_range[0]:n_range[1]], X_grad) + + if het_noise: + dL_dthetaL[n_range[0]:n_range[1]] = grad_dict['dL_dthetaL'] + else: + dL_dthetaL += grad_dict['dL_dthetaL'] + + # Set the gradients w.r.t. 
kernel + model.kern.gradient = kern_grad + + # Update Log-likelihood + model._log_marginal_likelihood -= model.variational_prior.KL_divergence(model.X) + # update for the KL divergence + model.variational_prior.update_gradients_KL(model.X) + + # dL_dthetaL + model.likelihood.update_gradients(dL_dthetaL) diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index c2877d06..3ffe1f5b 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -22,6 +22,7 @@ class RBF(Stationary): def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='rbf'): super(RBF, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name) self.weave_options = {} + self.group_spike_prob = False def K_of_r(self, r): return self.variance * np.exp(-0.5 * r**2) @@ -158,6 +159,9 @@ class RBF(Stationary): grad_mu += (dL_dpsi2[:, :, :, None] * _dpsi2_dmu).reshape(ndata,-1,self.input_dim).sum(axis=1) grad_S += (dL_dpsi2[:, :, :, None] * _dpsi2_dS).reshape(ndata,-1,self.input_dim).sum(axis=1) grad_gamma += (dL_dpsi2[:,:,:, None] * _dpsi2_dgamma).reshape(ndata,-1,self.input_dim).sum(axis=1) + + if self.group_spike_prob: + grad_gamma[:] = grad_gamma.mean(axis=0) return grad_mu, grad_S, grad_gamma diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py index 95230f54..974d3d61 100644 --- a/GPy/models/bayesian_gplvm.py +++ b/GPy/models/bayesian_gplvm.py @@ -9,6 +9,8 @@ from ..likelihoods import Gaussian from ..inference.optimization import SCG from ..util import linalg from ..core.parameterization.variational import NormalPosterior, NormalPrior,VariationalPosterior +from ..inference.latent_function_inference.var_dtc_parallel import update_gradients +from ..inference.latent_function_inference.var_dtc_gpu import VarDTC_GPU class BayesianGPLVM(SparseGP): """ @@ -67,8 +69,9 @@ class BayesianGPLVM(SparseGP): X.mean.gradient, X.variance.gradient = X_grad def parameters_changed(self): - update_gradients(self) - return + if isinstance(self.inference_method, VarDTC_GPU): + update_gradients(self) + return super(BayesianGPLVM, self).parameters_changed() self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X) @@ -158,57 +161,7 @@ class BayesianGPLVM(SparseGP): from ..plotting.matplot_dep import dim_reduction_plots return dim_reduction_plots.plot_steepest_gradient_map(self,*args,**kwargs) - - -def update_gradients(model): - model._log_marginal_likelihood, dL_dKmm, model.posterior = model.inference_method.inference_likelihood(model.kern, model.X, model.Z, model.likelihood, model.Y) - het_noise = model.likelihood.variance.size > 1 - - if het_noise: - dL_dthetaL = np.empty((model.Y.shape[0],)) - else: - dL_dthetaL = 0 - - #gradients w.r.t. kernel - model.kern.update_gradients_full(dL_dKmm, model.Z, None) - kern_grad = model.kern.gradient.copy() - - #gradients w.r.t. Z - model.Z.gradient[:,model.kern.active_dims] = model.kern.gradients_X(dL_dKmm, model.Z) - - isEnd = False - while not isEnd: - isEnd, n_range, grad_dict = model.inference_method.inference_minibatch(model.kern, model.X, model.Z, model.likelihood, model.Y) - if isinstance(model.X, VariationalPosterior): - - #gradients w.r.t. kernel - model.kern.update_gradients_expectations(variational_posterior=model.X[n_range[0]:n_range[1]], Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2']) - kern_grad += model.kern.gradient - - #gradients w.r.t. 
Z - model.Z.gradient[:,model.kern.active_dims] += model.kern.gradients_Z_expectations( - grad_dict['dL_dpsi1'], grad_dict['dL_dpsi2'], Z=model.Z, variational_posterior=model.X[n_range[0]:n_range[1]]) - - #gradients w.r.t. posterior parameters of X - X_grad = model.kern.gradients_qX_expectations(variational_posterior=model.X[n_range[0]:n_range[1]], Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2']) - model.set_X_gradients(model.X[n_range[0]:n_range[1]], X_grad) - - if het_noise: - dL_dthetaL[n_range[0]:n_range[1]] = grad_dict['dL_dthetaL'] - else: - dL_dthetaL += grad_dict['dL_dthetaL'] - - # Set the gradients w.r.t. kernel - model.kern.gradient = kern_grad - - # Update Log-likelihood - model._log_marginal_likelihood -= model.variational_prior.KL_divergence(model.X) - # update for the KL divergence - model.variational_prior.update_gradients_KL(model.X) - - # dL_dthetaL - model.likelihood.update_gradients(dL_dthetaL) def latent_cost_and_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2): """ diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py index 1c2ecf4c..e32745c7 100644 --- a/GPy/models/ss_gplvm.py +++ b/GPy/models/ss_gplvm.py @@ -25,7 +25,7 @@ class SSGPLVM(SparseGP): """ def __init__(self, Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10, - Z=None, kernel=None, inference_method=None, likelihood=None, name='Spike-and-Slab GPLVM', **kwargs): + Z=None, kernel=None, inference_method=None, likelihood=None, name='Spike-and-Slab GPLVM', group_spike=False, **kwargs): if X == None: # The mean of variational approximation (mu) from ..util.initialization import initialize_latent @@ -38,6 +38,9 @@ class SSGPLVM(SparseGP): gamma = np.empty_like(X) # The posterior probabilities of the binary variable in the variational approximation gamma[:] = 0.5 + 0.01 * np.random.randn(X.shape[0], input_dim) + if group_spike: + gamma[:] = gamma.mean(axis=0) + if Z is None: Z = np.random.permutation(X.copy())[:num_inducing] assert Z.shape[1] == X.shape[1] @@ -47,11 +50,16 @@ class SSGPLVM(SparseGP): if kernel is None: kernel = kern.SSRBF(input_dim) - + pi = np.empty((input_dim)) pi[:] = 0.5 self.variational_prior = SpikeAndSlabPrior(pi=pi) # the prior probability of the latent binary variable b X = SpikeAndSlabPosterior(X, X_variance, gamma) + + if group_spike: + kernel.group_spike_prob = True + self.variational_prior.group_spike_prob = True + SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, name, **kwargs) self.add_parameter(self.X, index=0) diff --git a/GPy/plotting/matplot_dep/variational_plots.py b/GPy/plotting/matplot_dep/variational_plots.py index cf00d8a2..27cb4051 100644 --- a/GPy/plotting/matplot_dep/variational_plots.py +++ b/GPy/plotting/matplot_dep/variational_plots.py @@ -45,7 +45,7 @@ def plot(parameterized, fignum=None, ax=None, colors=None): fig.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95)) return fig -def plot_SpikeSlab(parameterized, fignum=None, ax=None, colors=None): +def plot_SpikeSlab(parameterized, fignum=None, ax=None, colors=None, side_by_side=True): """ Plot latent space X in 1D: @@ -58,7 +58,10 @@ def plot_SpikeSlab(parameterized, fignum=None, ax=None, colors=None): """ if ax is None: - fig = pb.figure(num=fignum, figsize=(8, min(12, (2 * parameterized.mean.shape[1])))) + if side_by_side: + fig = pb.figure(num=fignum, figsize=(16, min(12, (2 * parameterized.mean.shape[1])))) + else: + fig = pb.figure(num=fignum, figsize=(8, min(12, (2 * parameterized.mean.shape[1])))) if colors is 
None: colors = pb.gca()._get_lines.color_cycle pb.clf() @@ -68,8 +71,15 @@ def plot_SpikeSlab(parameterized, fignum=None, ax=None, colors=None): means, variances, gamma = param_to_array(parameterized.mean, parameterized.variance, parameterized.binary_prob) x = np.arange(means.shape[0]) for i in range(means.shape[1]): + if side_by_side: + sub1 = (means.shape[1],2,2*i+1) + sub2 = (means.shape[1],2,2*i+2) + else: + sub1 = (means.shape[1]*2,1,2*i+1) + sub2 = (means.shape[1]*2,1,2*i+2) + # mean and variance plot - a = fig.add_subplot(means.shape[1]*2, 1, 2*i + 1) + a = fig.add_subplot(*sub1) a.plot(means, c='k', alpha=.3) plots.extend(a.plot(x, means.T[i], c=colors.next(), label=r"$\mathbf{{X_{{{}}}}}$".format(i))) a.fill_between(x, @@ -82,7 +92,7 @@ def plot_SpikeSlab(parameterized, fignum=None, ax=None, colors=None): if i < means.shape[1] - 1: a.set_xticklabels('') # binary prob plot - a = fig.add_subplot(means.shape[1]*2, 1, 2*i + 2) + a = fig.add_subplot(*sub2) a.bar(x,gamma[:,i],bottom=0.,linewidth=0,align='center') a.set_xlim(x.min(), x.max()) a.set_ylim([0.,1.]) From e4d19120cd9e4c5871dff4475667c31af16957ff Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Wed, 26 Mar 2014 10:54:41 +0000 Subject: [PATCH 06/33] [GPU] add linalg_gpu ssrbf_gpucomp --- GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py | 170 ++++++++++++++++++++ GPy/kern/_src/rbf.py | 2 +- GPy/models/bayesian_gplvm.py | 3 +- GPy/util/__init__.py | 1 + GPy/util/linalg_gpu.py | 13 ++ 5 files changed, 186 insertions(+), 3 deletions(-) create mode 100644 GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py create mode 100644 GPy/util/linalg_gpu.py diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py new file mode 100644 index 00000000..2acddae9 --- /dev/null +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py @@ -0,0 +1,170 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
+# Licensed under the BSD 3-clause license (see LICENSE.txt) + +""" +The package for the psi statistics computation on GPU +""" + +import numpy as np +from GPy.util.caching import Cache_this + +try: + import scikits.cuda.linalg as culinalg + import pycuda.gpuarray as gpuarray + from scikits.cuda import cublas + import pycuda.autoinit + from pycuda.reduction import ReductionKernel + from ...util.linalg_gpu import logDiagSum + + from pycuda.elementwise import ElementwiseKernel + + # The kernel form computing psi1 + comp_psi1 = ElementwiseKernel( + "double *psi1, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q", + "psi1[i] = comp_psi1_element(var,l, Z, mu, S, logGamma, log1Gamma, psi1denom, N, M, Q, i)", + "comp_psi1", + preamble=""" + #define IDX_MQ(n,m,q) ((n*M+m)*Q+q) + #define IDX_Q(n,q) (n*Q+q) + + __device__ double comp_psi1_element(double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q, int idx) + { + int n = idx/M; + int m = idx%M; + double psi1=0; + for(int q=0;q=exp2?exp1+log(1.0+exp(exp2-exp1)):exp2+log(1.0+exp(exp1-exp2)); + } + return var*exp(psi1); + } + """) +except: + pass + +class PSICOMP_SSRBF(object): + def __init__(self): + pass + +@Cache_this(limit=1) +def _Z_distances(Z): + Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q + Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q + return Zhat, Zdist + +def _psicomputations(variance, lengthscale, Z, mu, S, gamma): + """ + """ + + +@Cache_this(limit=1) +def _psi1computations(variance, lengthscale, Z, mu, S, gamma): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi1 and psi2 + # Produced intermediate results: + # _psi1 NxM + # _dpsi1_dvariance NxM + # _dpsi1_dlengthscale NxMxQ + # _dpsi1_dZ NxMxQ + # _dpsi1_dgamma NxMxQ + # _dpsi1_dmu NxMxQ + # _dpsi1_dS NxMxQ + + lengthscale2 = np.square(lengthscale) + + # psi1 + _psi1_denom = S[:, None, :] / lengthscale2 + 1. 
# Nx1xQ + _psi1_denom_sqrt = np.sqrt(_psi1_denom) #Nx1xQ + _psi1_dist = Z[None, :, :] - mu[:, None, :] # NxMxQ + _psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom) # NxMxQ + _psi1_common = gamma[:,None,:] / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #Nx1xQ + _psi1_exponent1 = np.log(gamma[:,None,:]) -0.5 * (_psi1_dist_sq + np.log(_psi1_denom)) # NxMxQ + _psi1_exponent2 = np.log(1.-gamma[:,None,:]) -0.5 * (np.square(Z[None,:,:])/lengthscale2) # NxMxQ + _psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2) + _psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ + _psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM + _psi1_exp_dist_sq = np.exp(-0.5*_psi1_dist_sq) # NxMxQ + _psi1_exp_Z = np.exp(-0.5*np.square(Z[None,:,:])/lengthscale2) # 1xMxQ + _psi1_q = variance * np.exp(_psi1_exp_sum[:,:,None] - _psi1_exponent) # NxMxQ + _psi1 = variance * np.exp(_psi1_exp_sum) # NxM + _dpsi1_dvariance = _psi1 / variance # NxM + _dpsi1_dgamma = _psi1_q * (_psi1_exp_dist_sq/_psi1_denom_sqrt-_psi1_exp_Z) # NxMxQ + _dpsi1_dmu = _psi1_q * (_psi1_exp_dist_sq * _psi1_dist * _psi1_common) # NxMxQ + _dpsi1_dS = _psi1_q * (_psi1_exp_dist_sq * _psi1_common * 0.5 * (_psi1_dist_sq - 1.)) # NxMxQ + _dpsi1_dZ = _psi1_q * (- _psi1_common * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z) # NxMxQ + _dpsi1_dlengthscale = 2.*lengthscale*_psi1_q * (0.5*_psi1_common*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + 0.5*(1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z) # NxMxQ + + N = mu.shape[0] + M = Z.shape[0] + Q = mu.shape[1] + + l_gpu = gpuarray.to_gpu(lengthscale2) + Z_gpu = gpuarray.to_gpu(Z) + mu_gpu = gpuarray.to_gpu(mu) + S_gpu = gpuarray.to_gpu(S) + #gamma_gpu = gpuarray.to_gpu(gamma) + logGamma_gpu = gpuarray.to_gpu(np.log(gamma)) + log1Gamma_gpu = gpuarray.to_gpu(np.log(1.-gamma)) + logpsi1denom_gpu = gpuarray.to_gpu(np.log(S/lengthscale2+1.)) + psi1_gpu = gpuarray.empty((mu.shape[0],Z.shape[0]),np.float64) + + comp_psi1(psi1_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi1denom_gpu, N, M, Q) + + print np.abs(psi1_gpu.get()-_psi1).max() + + return _psi1, _dpsi1_dvariance, _dpsi1_dgamma, _dpsi1_dmu, _dpsi1_dS, _dpsi1_dZ, _dpsi1_dlengthscale + +@Cache_this(limit=1) +def _psi2computations(variance, lengthscale, Z, mu, S, gamma): + """ + Z - MxQ + mu - NxQ + S - NxQ + gamma - NxQ + """ + # here are the "statistics" for psi1 and psi2 + # Produced intermediate results: + # _psi2 NxMxM + # _psi2_dvariance NxMxM + # _psi2_dlengthscale NxMxMxQ + # _psi2_dZ NxMxMxQ + # _psi2_dgamma NxMxMxQ + # _psi2_dmu NxMxMxQ + # _psi2_dS NxMxMxQ + + lengthscale2 = np.square(lengthscale) + + _psi2_Zhat, _psi2_Zdist = _Z_distances(Z) + _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q + _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ + + # psi2 + _psi2_denom = 2.*S[:, None, None, :] / lengthscale2 + 1. 
# Nx1x1xQ + _psi2_denom_sqrt = np.sqrt(_psi2_denom) + _psi2_mudist = mu[:,None,None,:]-_psi2_Zhat #N,M,M,Q + _psi2_mudist_sq = np.square(_psi2_mudist)/(lengthscale2*_psi2_denom) + _psi2_common = gamma[:,None,None,:]/(lengthscale2 * _psi2_denom * _psi2_denom_sqrt) # Nx1x1xQ + _psi2_exponent1 = -_psi2_Zdist_sq -_psi2_mudist_sq -0.5*np.log(_psi2_denom)+np.log(gamma[:,None,None,:]) #N,M,M,Q + _psi2_exponent2 = np.log(1.-gamma[:,None,None,:]) - 0.5*(_psi2_Z_sq_sum) # NxMxMxQ + _psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2) + _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max)) + _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM + _psi2_q = np.square(variance) * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ + _psi2_exp_dist_sq = np.exp(-_psi2_Zdist_sq -_psi2_mudist_sq) # NxMxMxQ + _psi2_exp_Z = np.exp(-0.5*_psi2_Z_sq_sum) # MxMxQ + _psi2 = np.square(variance) * np.exp(_psi2_exp_sum) # N,M,M + _dpsi2_dvariance = 2. * _psi2/variance # NxMxM + _dpsi2_dgamma = _psi2_q * (_psi2_exp_dist_sq/_psi2_denom_sqrt - _psi2_exp_Z) # NxMxMxQ + _dpsi2_dmu = _psi2_q * (-2.*_psi2_common*_psi2_mudist * _psi2_exp_dist_sq) # NxMxMxQ + _dpsi2_dS = _psi2_q * (_psi2_common * (2.*_psi2_mudist_sq - 1.) * _psi2_exp_dist_sq) # NxMxMxQ + _dpsi2_dZ = 2.*_psi2_q * (_psi2_common*(-_psi2_Zdist*_psi2_denom+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z) # NxMxMxQ + _dpsi2_dlengthscale = 2.*lengthscale* _psi2_q * (_psi2_common*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z) # NxMxMxQ + + return _psi2, _dpsi2_dvariance, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _dpsi2_dZ, _dpsi2_dlengthscale diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index 3ffe1f5b..32d5e1a5 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -8,7 +8,7 @@ from ...util.misc import param_to_array from stationary import Stationary from GPy.util.caching import Cache_this from ...core.parameterization import variational -from psi_comp import ssrbf_psi_comp +from psi_comp import ssrbf_psi_gpucomp class RBF(Stationary): """ diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py index 974d3d61..1f01d4d5 100644 --- a/GPy/models/bayesian_gplvm.py +++ b/GPy/models/bayesian_gplvm.py @@ -2,13 +2,12 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np -from gplvm import GPLVM from .. 
import kern from ..core import SparseGP from ..likelihoods import Gaussian from ..inference.optimization import SCG from ..util import linalg -from ..core.parameterization.variational import NormalPosterior, NormalPrior,VariationalPosterior +from ..core.parameterization.variational import NormalPosterior, NormalPrior, VariationalPosterior from ..inference.latent_function_inference.var_dtc_parallel import update_gradients from ..inference.latent_function_inference.var_dtc_gpu import VarDTC_GPU diff --git a/GPy/util/__init__.py b/GPy/util/__init__.py index 8aea990c..77312278 100644 --- a/GPy/util/__init__.py +++ b/GPy/util/__init__.py @@ -15,6 +15,7 @@ import caching import diag import initialization import multioutput +import linalg_gpu try: import sympy diff --git a/GPy/util/linalg_gpu.py b/GPy/util/linalg_gpu.py new file mode 100644 index 00000000..e066bc04 --- /dev/null +++ b/GPy/util/linalg_gpu.py @@ -0,0 +1,13 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +# +# The utility functions for GPU computation +# +import numpy as np + +try: + from pycuda.reduction import ReductionKernel + logDiagSum = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="i%step==0?log(x[i]):0", arguments="double *x, int step") +except: \ No newline at end of file From bc59cb8b225597df9e2d23294498e92e9768dbaf Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Wed, 26 Mar 2014 17:09:01 +0000 Subject: [PATCH 07/33] [GPU] psi1 after debug --- .../latent_function_inference/var_dtc_gpu.py | 28 ++-- GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py | 152 +++++++++++++++--- GPy/kern/_src/rbf.py | 2 +- GPy/models/ss_gplvm.py | 11 ++ GPy/util/linalg_gpu.py | 4 +- 5 files changed, 157 insertions(+), 40 deletions(-) diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py index ba7ec602..75a07992 100644 --- a/GPy/inference/latent_function_inference/var_dtc_gpu.py +++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py @@ -52,17 +52,17 @@ class VarDTC_GPU(object): def _initGPUCache(self, num_inducing, output_dim): if self.gpuCache == None: self.gpuCache = {# inference_likelihood - 'Kmm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64), - 'Lm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64), - 'ones_gpu' :gpuarray.empty(num_inducing, np.float64), - 'LL_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64), - 'b_gpu' :gpuarray.empty((num_inducing,output_dim),np.float64), - 'v_gpu' :gpuarray.empty((num_inducing,output_dim),np.float64), - 'vvt_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64), - 'KmmInvPsi2LLInvT_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64), - 'KmmInvPsi2P_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64), - 'dL_dpsi2R_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64), - 'dL_dKmm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64), + 'Kmm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), + 'Lm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), + 'ones_gpu' :gpuarray.empty(num_inducing, np.float64,order='F'), + 'LL_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), + 'b_gpu' :gpuarray.empty((num_inducing,output_dim),np.float64,order='F'), + 'v_gpu' :gpuarray.empty((num_inducing,output_dim),np.float64,order='F'), + 'vvt_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), + 
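+                             # column-major ('F') buffers match the layout cuBLAS expects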
'KmmInvPsi2LLInvT_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), + 'KmmInvPsi2P_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), + 'dL_dpsi2R_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), + 'dL_dKmm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), # inference_minibatch } self.gpuCache['ones_gpu'].fill(1.0) @@ -134,11 +134,11 @@ class VarDTC_GPU(object): if het_noise: beta_slice = beta[n_start:n_end] psi0_full += (beta_slice*psi0).sum() - psi1Y_full += np.dot(psi1,beta_slice[:,None]*Y_slice) # DxM + psi1Y_full += np.dot(psi1.T,beta_slice[:,None]*Y_slice) # MxD YRY_full += (beta_slice*np.square(Y_slice).sum(axis=-1)).sum() else: psi0_full += psi0.sum() - psi1Y_full += np.dot(psi1,Y_slice) # DxM + psi1Y_full += np.dot(psi1.T,Y_slice) # MxD if uncertain_inputs: @@ -275,7 +275,7 @@ class VarDTC_GPU(object): # Compute the Posterior distribution of inducing points p(u|Y) #====================================================================== - post = Posterior(woodbury_inv=KmmInvPsi2P_gpu.get(), woodbury_vector=v_gpu.get(), K=Kmm_gpu.get(), mean=None, cov=None, K_chol=Lm.get()) + post = Posterior(woodbury_inv=KmmInvPsi2P_gpu.get(), woodbury_vector=v_gpu.get(), K=Kmm_gpu.get(), mean=None, cov=None, K_chol=Lm_gpu.get()) return logL, dL_dKmm, post diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py index 2acddae9..467b779d 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py @@ -13,32 +13,118 @@ try: import pycuda.gpuarray as gpuarray from scikits.cuda import cublas import pycuda.autoinit - from pycuda.reduction import ReductionKernel - from ...util.linalg_gpu import logDiagSum - + from pycuda.reduction import ReductionKernel from pycuda.elementwise import ElementwiseKernel # The kernel form computing psi1 comp_psi1 = ElementwiseKernel( - "double *psi1, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q", - "psi1[i] = comp_psi1_element(var,l, Z, mu, S, logGamma, log1Gamma, psi1denom, N, M, Q, i)", + "double *psi1, double var, double l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q", + "psi1[i] = comp_psi1_element(var,l, Z, mu, S, logGamma, log1Gamma, logpsi1denom, N, M, Q, i)", "comp_psi1", preamble=""" - #define IDX_MQ(n,m,q) ((n*M+m)*Q+q) - #define IDX_Q(n,q) (n*Q+q) + #define IDX_NMQ(n,m,q) ((q*M+m)*N+n) + #define IDX_NQ(n,q) (q*N+n) + #define IDX_MQ(m,q) (q*M+m) + #define LOGEXPSUM(a,b) (a>=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) + + __device__ double comp_psi1_element(double var, double l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q, int idx) + { + int n = idx%N; + int m = idx/N; + double psi1_exp=0; + for(int q=0;q=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) __device__ double comp_psi1_element(double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q, int idx) { - int n = idx/M; - int m = idx%M; - double psi1=0; + int n = idx%N; + int m = idx/N; + double psi1_exp=0; for(int q=0;q=exp2?exp1+log(1.0+exp(exp2-exp1)):exp2+log(1.0+exp(exp1-exp2)); + double muZ = mu[IDX_NQ(n,q)]-Z[IDX_MQ(m,q)]; + double exp1 = logGamma[IDX_NQ(n,q)] - (logpsi1denom[IDX_NQ(n,q)] + muZ*muZ/(S[IDX_NQ(n,q)]+l[q]) )/2.0; + 
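+            // The per-dimension psi1 factor is the spike-and-slab mixture
+            //   gamma*exp(exp1) + (1-gamma)*exp(exp2),
+            // accumulated in log-space: LOGEXPSUM(a,b) = log(exp(a)+exp(b)),
+            // evaluated around the larger exponent for numerical stability.
+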
double exp2 = log1Gamma[IDX_NQ(n,q)] - Z[IDX_MQ(m,q)]*Z[IDX_MQ(m,q)]/(l[q]*2.0); + psi1_exp += LOGEXPSUM(exp1,exp2); } - return var*exp(psi1); + return var*exp(psi1_exp); + } + """) + + # The kernel form computing psi2 het_noise + comp_psi2_het = ElementwiseKernel( + "double *psi2, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q", + "psi2[i] = comp_psi1_element(var,l, Z, mu, S, logGamma, log1Gamma, logpsi2denom, N, M, Q, i)", + "comp_psi2", + preamble=""" + #define IDX_NMQ(n,m,q) ((q*M+m)*N+n) + #define IDX_NQ(n,q) (q*N+n) + #define IDX_MQ(m,q) (q*M+m) + #define LOGEXPSUM(a,b) (a>=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) + + __device__ double comp_psi1_element(double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q, int idx) + { + // psi2 (n,m1,m2) + int m2 = idx/(M*N); + int m1 = (idx%(M*N))/N; + int n = idx%N; + + double psi2_exp=0; + for(int q=0;q=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) + + __device__ double comp_psi1_element(double var, double l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q, int idx) + { + // psi2 (n,m1,m2) + int m2 = idx/(M*N); + int m1 = (idx%(M*N))/N; + int n = idx%N; + + double psi2_exp=0; + for(int q=0;q Date: Thu, 27 Mar 2014 17:12:17 +0000 Subject: [PATCH 08/33] [GPU] psicommputation --- GPy/kern/_src/kern.py | 5 +- GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py | 125 ++++++++++++++++++-- GPy/kern/_src/rbf.py | 24 +++- GPy/models/ss_gplvm.py | 4 +- GPy/util/linalg_gpu.py | 14 +++ 5 files changed, 153 insertions(+), 19 deletions(-) diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index 31fa8690..be8a15b2 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -16,7 +16,8 @@ class Kern(Parameterized): __metaclass__ = KernCallsViaSlicerMeta #=========================================================================== _debug=False - def __init__(self, input_dim, active_dims, name, *a, **kw): + _support_GPU=False + def __init__(self, input_dim, active_dims, name, useGPU=False,*a, **kw): """ The base class for a kernel: a positive definite function which forms of a covariance function (kernel). 
@@ -40,6 +41,8 @@ class Kern(Parameterized): active_dim_size = len(self.active_dims) assert active_dim_size == self.input_dim, "input_dim={} does not match len(active_dim)={}, active_dims={}".format(self.input_dim, active_dim_size, self.active_dims) self._sliced_X = 0 + + self.useGPU = self._support_GPU and useGPU @Cache_this(limit=10) def _slice_X(self, X): diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py index 467b779d..071d8795 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py @@ -15,6 +15,7 @@ try: import pycuda.autoinit from pycuda.reduction import ReductionKernel from pycuda.elementwise import ElementwiseKernel + from ....util import linalg_gpu # The kernel form computing psi1 comp_psi1 = ElementwiseKernel( @@ -45,15 +46,15 @@ try: # The kernel form computing psi1 het_noise comp_psi1_het = ElementwiseKernel( "double *psi1, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q", - "psi1[i] = comp_psi1_element(var,l, Z, mu, S, logGamma, log1Gamma, logpsi1denom, N, M, Q, i)", - "comp_psi1", + "psi1[i] = comp_psi1_element_het(var,l, Z, mu, S, logGamma, log1Gamma, logpsi1denom, N, M, Q, i)", + "comp_psi1_het", preamble=""" #define IDX_NMQ(n,m,q) ((q*M+m)*N+n) #define IDX_NQ(n,q) (q*N+n) #define IDX_MQ(m,q) (q*M+m) #define LOGEXPSUM(a,b) (a>=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) - __device__ double comp_psi1_element(double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q, int idx) + __device__ double comp_psi1_element_het(double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q, int idx) { int n = idx%N; int m = idx/N; @@ -71,15 +72,15 @@ try: # The kernel form computing psi2 het_noise comp_psi2_het = ElementwiseKernel( "double *psi2, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q", - "psi2[i] = comp_psi1_element(var,l, Z, mu, S, logGamma, log1Gamma, logpsi2denom, N, M, Q, i)", - "comp_psi2", + "psi2[i] = comp_psi2_element_het(var,l, Z, mu, S, logGamma, log1Gamma, logpsi2denom, N, M, Q, i)", + "comp_psi2_het", preamble=""" #define IDX_NMQ(n,m,q) ((q*M+m)*N+n) #define IDX_NQ(n,q) (q*N+n) #define IDX_MQ(m,q) (q*M+m) #define LOGEXPSUM(a,b) (a>=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) - __device__ double comp_psi1_element(double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q, int idx) + __device__ double comp_psi2_element_het(double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q, int idx) { // psi2 (n,m1,m2) int m2 = idx/(M*N); @@ -90,7 +91,7 @@ try: for(int q=0;q=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) - __device__ double comp_psi1_element(double var, double l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q, int idx) + __device__ double comp_psi2_element(double var, double l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q, int idx) { // psi2 (n,m1,m2) int m2 = idx/(M*N); @@ -120,19 +121,117 @@ try: for(int q=0;q1: + het_noise = True + else: + het_noise = False 
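+
+        # Psi statistics are kernel expectations under q(X):
+        #   psi0[n]      = E_q[ k(x_n, x_n) ]
+        #   psi1[n,m]    = E_q[ k(x_n, z_m) ]
+        #   psi2[n,m,m'] = E_q[ k(x_n, z_m) * k(x_n, z_m') ]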
+ + N = mu.shape[0] + M = Z.shape[0] + Q = mu.shape[1] + + self._initGPUCache(N,M,Q) + if het_noise: + l_gpu = self.gpuCache['l_gpu'] + l_gpu.set(np.asfortranarray(lengthscale**2)) + else: + lengthscale2 = lengthscale**2 + + Z_gpu = self.gpuCache['Z_gpu'] + mu_gpu = self.gpuCache['mu_gpu'] + S_gpu = self.gpuCache['S_gpu'] + gamma_gpu = self.gpuCache['gamma_gpu'] + logGamma_gpu = self.gpuCache['logGamma_gpu'] + log1Gamma_gpu = self.gpuCache['log1Gamma_gpu'] + logpsidenom_gpu = self.gpuCache['logpsidenom_gpu'] + psi0_gpu = self.gpuCache['psi0_gpu'] + psi1_gpu = self.gpuCache['psi1_gpu'] + psi2_gpu = self.gpuCache['psi2_gpu'] + + Z_gpu.set(np.asfortranarray(Z)) + mu_gpu.set(np.asfortranarray(mu)) + S_gpu.set(S) + gamma_gpu.set(gamma) + linalg_gpu.log(gamma_gpu,logGamma_gpu) + linalg_gpu.logOne(gamma_gpu,log1Gamma_gpu) + + psi0_gpu.fill(variance) + if het_noise: + comp_logpsidenom_het(logpsidenom_gpu, S_gpu,l_gpu,1.0) + comp_psi1_het(psi1_gpu, variance, lengthscale2, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsidenom_gpu, N, M, Q) + comp_logpsidenom_het(logpsidenom_gpu, S_gpu,l_gpu,2.0) + comp_psi2_het(psi2_gpu, variance, lengthscale2, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsidenom_gpu, N, M, Q) + else: + comp_logpsidenom(logpsidenom_gpu, S_gpu,lengthscale2,1.0) + comp_psi1(psi1_gpu, variance, lengthscale2, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsidenom_gpu, N, M, Q) + comp_logpsidenom(logpsidenom_gpu, S_gpu,lengthscale2,2.0) + comp_psi2(psi2_gpu, variance, lengthscale2, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsidenom_gpu, N, M, Q) + + return psi0_gpu.get(), psi1_gpu.get(), psi2_gpu.get() + @Cache_this(limit=1) def _Z_distances(Z): @@ -199,7 +298,7 @@ def _psi1computations(variance, lengthscale, Z, mu, S, gamma): logGamma_gpu = gpuarray.to_gpu(np.asfortranarray(np.log(gamma))) log1Gamma_gpu = gpuarray.to_gpu(np.asfortranarray(np.log(1.-gamma))) logpsi1denom_gpu = gpuarray.to_gpu(np.asfortranarray(np.log(S/lengthscale2+1.))) - psi1_gpu = gpuarray.empty((mu.shape[0],Z.shape[0]),np.float64) + psi1_gpu = gpuarray.empty((mu.shape[0],Z.shape[0]),np.float64, order='F') comp_psi1(psi1_gpu, variance, lengthscale2, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi1denom_gpu, N, M, Q) @@ -265,7 +364,7 @@ def _psi2computations(variance, lengthscale, Z, mu, S, gamma): logGamma_gpu = gpuarray.to_gpu(np.asfortranarray(np.log(gamma))) log1Gamma_gpu = gpuarray.to_gpu(np.asfortranarray(np.log(1.-gamma))) logpsi2denom_gpu = gpuarray.to_gpu(np.asfortranarray(np.log(2.*S/lengthscale2+1.))) - psi2_gpu = gpuarray.empty((mu.shape[0],Z.shape[0],Z.shape[0]),np.float64) + psi2_gpu = gpuarray.empty((mu.shape[0],Z.shape[0],Z.shape[0]),np.float64, order='F') comp_psi2(psi2_gpu, variance, lengthscale2, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi2denom_gpu, N, M, Q) diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index 0cf8b8de..20234c99 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -8,7 +8,8 @@ from ...util.misc import param_to_array from stationary import Stationary from GPy.util.caching import Cache_this from ...core.parameterization import variational -from psi_comp import ssrbf_psi_gpucomp as ssrbf_psi_comp +from psi_comp import ssrbf_psi_comp +from psi_comp.ssrbf_psi_gpucomp import PSICOMP_SSRBF class RBF(Stationary): """ @@ -19,10 +20,15 @@ class RBF(Stationary): k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg) """ + _support_GPU = True def __init__(self, input_dim, variance=1., lengthscale=None, 
ARD=False, active_dims=None, name='rbf'): super(RBF, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name) self.weave_options = {} self.group_spike_prob = False + + if self.useGPU: + self.psicomp = PSICOMP_SSRBF() + def K_of_r(self, r): return self.variance * np.exp(-0.5 * r**2) @@ -35,18 +41,28 @@ class RBF(Stationary): #---------------------------------------# def psi0(self, Z, variational_posterior): - return self.Kdiag(variational_posterior.mean) + if self.useGPU: + if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): + return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[0] + else: + return self.Kdiag(variational_posterior.mean) def psi1(self, Z, variational_posterior): if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): - psi1, _, _, _, _, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) + if self.useGPU: + return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[1] + else: + psi1, _, _, _, _, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) else: _, _, _, psi1 = self._psi1computations(Z, variational_posterior) return psi1 def psi2(self, Z, variational_posterior): if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): - psi2, _, _, _, _, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) + if self.useGPU: + return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[2] + else: + psi2, _, _, _, _, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) else: _, _, _, _, psi2 = self._psi2computations(Z, variational_posterior) return psi2 diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py index ec0f032a..eb7c4428 100644 --- a/GPy/models/ss_gplvm.py +++ b/GPy/models/ss_gplvm.py @@ -62,7 +62,9 @@ class SSGPLVM(SparseGP): if group_spike: kernel.group_spike_prob = True self.variational_prior.group_spike_prob = True - + + if isinstance(inference_method, VarDTC_GPU) and self.kern._support_GPU: + self.kern.useGPU = True SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, name, **kwargs) self.add_parameter(self.X, index=0) diff --git a/GPy/util/linalg_gpu.py b/GPy/util/linalg_gpu.py index 12d5a823..d2528a63 100644 --- a/GPy/util/linalg_gpu.py +++ b/GPy/util/linalg_gpu.py @@ -10,6 +10,20 @@ import numpy as np try: import pycuda.autoinit from pycuda.reduction import ReductionKernel + from pycuda.elementwise import ElementwiseKernel + + # log|A| for A is a low triangle matrix + # logDiagSum(A, A.shape[0]+1) logDiagSum = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="i%step==0?log(x[i]):0", arguments="double *x, int step") + + #======================================================================================= + # Element-wise functions + 
#======================================================================================= + + # log(X) + log = ElementwiseKernel("double *in, double *out", "out[i] = log(in[i])", "log_element") + + # log(1.0-X) + logOne = ElementwiseKernel("double *in, double *out", "out[i] = log(1.-in[i])", "logOne_element") except: pass From b945e8d01fd86322275c9fdce12f5936a7b4e839 Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Mon, 31 Mar 2014 16:18:06 +0100 Subject: [PATCH 09/33] [GPU] psi1 --- GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py | 213 ++++++++++---------- GPy/kern/_src/rbf.py | 4 +- GPy/kern/_src/stationary.py | 4 +- GPy/models/ss_gplvm.py | 2 - 4 files changed, 114 insertions(+), 109 deletions(-) diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py index 071d8795..6ad9b20a 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py @@ -17,10 +17,11 @@ try: from pycuda.elementwise import ElementwiseKernel from ....util import linalg_gpu - # The kernel form computing psi1 + + # The kernel form computing psi1 het_noise comp_psi1 = ElementwiseKernel( - "double *psi1, double var, double l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q", - "psi1[i] = comp_psi1_element(var,l, Z, mu, S, logGamma, log1Gamma, logpsi1denom, N, M, Q, i)", + "double *psi1, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q", + "psi1[i] = comp_psi1_element(var, l, Z, mu, S, logGamma, log1Gamma, logpsi1denom, N, M, Q, i)", "comp_psi1", preamble=""" #define IDX_NMQ(n,m,q) ((q*M+m)*N+n) @@ -28,33 +29,7 @@ try: #define IDX_MQ(m,q) (q*M+m) #define LOGEXPSUM(a,b) (a>=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) - __device__ double comp_psi1_element(double var, double l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q, int idx) - { - int n = idx%N; - int m = idx/N; - double psi1_exp=0; - for(int q=0;q=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) - - __device__ double comp_psi1_element_het(double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q, int idx) + __device__ double comp_psi1_element(double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q, int idx) { int n = idx%N; int m = idx/N; @@ -68,19 +43,19 @@ try: return var*exp(psi1_exp); } """) - + # The kernel form computing psi2 het_noise - comp_psi2_het = ElementwiseKernel( + comp_psi2 = ElementwiseKernel( "double *psi2, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q", - "psi2[i] = comp_psi2_element_het(var,l, Z, mu, S, logGamma, log1Gamma, logpsi2denom, N, M, Q, i)", - "comp_psi2_het", + "psi2[i] = comp_psi2_element(var, l, Z, mu, S, logGamma, log1Gamma, logpsi2denom, N, M, Q, i)", + "comp_psi2", preamble=""" #define IDX_NMQ(n,m,q) ((q*M+m)*N+n) #define IDX_NQ(n,q) (q*N+n) #define IDX_MQ(m,q) (q*M+m) #define LOGEXPSUM(a,b) (a>=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) - __device__ double comp_psi2_element_het(double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q, int idx) + __device__ double comp_psi2_element(double var, double *l, double *Z, 
double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q, int idx) { // psi2 (n,m1,m2) int m2 = idx/(M*N); @@ -97,74 +72,103 @@ try: } return var*var*exp(psi2_exp); } + """) + + # compute psidenom + comp_logpsidenom = ElementwiseKernel( + "double *out, double *S, double *l, double scale, int N", + "out[i] = comp_logpsidenom_element(S, l, scale, N, i)", + "comp_logpsidenom", + preamble=""" + __device__ double comp_logpsidenom_element(double *S, double *l, double scale, int N, int idx) + { + int q = idx/N; + + return log(scale*S[idx]/l[q]+1.0); + } """) - # The kernel form computing psi2 - comp_psi2 = ElementwiseKernel( - "double *psi2, double var, double l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q", - "psi2[i] = comp_psi2_element(var,l, Z, mu, S, logGamma, log1Gamma, logpsi2denom, N, M, Q, i)", - "comp_psi2", + # The kernel form computing psi1 het_noise + comp_dpsi1_dvar = ElementwiseKernel( + "double *dpsi1_dvar, double *psi1_neq, double *psi1exp1, double *psi11exp2, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q", + "dpsi1_dvar[i] = comp_dpsi1_dvar_element(psi1_neq, psi1exp1, psi1exp2, l, Z, mu, S, logGamma, log1Gamma, logpsi1denom, N, M, Q, i)", + "comp_dpsi1_dvar", preamble=""" #define IDX_NMQ(n,m,q) ((q*M+m)*N+n) #define IDX_NQ(n,q) (q*N+n) #define IDX_MQ(m,q) (q*M+m) #define LOGEXPSUM(a,b) (a>=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) - __device__ double comp_psi2_element(double var, double l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q, int idx) + __device__ double comp_dpsi1_dvar_element(double *psi1_neq, double *psi1exp1, double *psi11exp2, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q, int idx) { - // psi2 (n,m1,m2) - int m2 = idx/(M*N); - int m1 = (idx%(M*N))/N; int n = idx%N; - - double psi2_exp=0; - for(int q=0;q=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) + + __device__ double comp_dpsi1_der_element(double *dpsi1_dmu, double *dpsi1_dS, double *dpsi1_dgamma, double *dpsi1_dZ, double var, double *psi1_neq, double psi1exp1, double *psi11exp2, double *l, double *Z, double *mu, double *S, double *gamma, int N, int M, int Q, int idx) { - int q = idx/N; + int q = idx/(M*N); + int m = (idx%(M*N))/N; int n = idx%N; + + double neq = psi1_neq[IDX_NMQ(n,m,q)]; + double gamma_c = gamma[IDX_NQ(n,q)]; + double Z_c = Z[IDX_MQ(m,q)]; + double S_c = S[IDX_NQ(n,q)]; + double l_c = l[q]; + double psi1exp1_c = psi1exp1[IDX_NMQ(n,m,q)]; + double psi1exp2_c = psi1exp2[IDX_MQ(m,q)]; - return scale*S[idx]/l[q]+1.0; + double denom = S_c/l_c+1.0; + double denom_sqrt = sqrt(denom); + double Zmu = Z_c-mu[IDX_NQ(n,q)]; + double psi1_common = gamma_c/(denom_sqrt*denom*l_c); + double gamma1 = 1-gamma_c + + dpsi1_dgamma[IDX_NMQ(n,m,q)] = var*neq*(psi1exp1_c/denom_sqrt - psi1exp2_c); + dpsi1_dmu[IDX_NMQ(n,m,q)] = var*neq*(psi1_common*Zmu*psi1exp1_c); + dpsi1_dS[IDX_NMQ(n,m,q)] = var*neq*(psi1_common*(Zmu*Zmu/(S_c+l_c)-1.0)*psi1exp1_c)/2.0; + dpsi1_dZ[IDX_NMQ(n,m,q)] = var*neq*(-psi1_common*Zmu*psi1exp1_c-gamma1*Z_c/l_c*psi1exp2_c); + return var*neq*(psi1_common*(S_c/l_c+Zmu*Zmu/(S_c+l_c))*psi1exp1_c+gamma1*Z_c*Z_c/l_c*psi1exp2_c)/2.0; } """) - - # compute psidenom - comp_logpsidenom = ElementwiseKernel( - "double *out, double *S, double l, double scale", - "out[i] = 
comp_logpsidenom_element(S, l, scale, i)", - "comp_logpsidenom", - preamble=""" - __device__ double comp_logpsidenom_element(double *S, double l, double scale, int idx) - { - int q = idx/N; - int n = idx%N; - - return scale*S[idx]/l+1.0; - } - """) - + except: pass class PSICOMP_SSRBF(object): - def __init__(self): + def __init__(self, cublas_handle): + self.cuhandle = cublas_handle self.gpuCache = None def _initGPUCache(self, N, M, Q): @@ -194,12 +198,7 @@ class PSICOMP_SSRBF(object): Q = mu.shape[1] self._initGPUCache(N,M,Q) - if het_noise: - l_gpu = self.gpuCache['l_gpu'] - l_gpu.set(np.asfortranarray(lengthscale**2)) - else: - lengthscale2 = lengthscale**2 - + l_gpu = self.gpuCache['l_gpu'] Z_gpu = self.gpuCache['Z_gpu'] mu_gpu = self.gpuCache['mu_gpu'] S_gpu = self.gpuCache['S_gpu'] @@ -210,26 +209,24 @@ class PSICOMP_SSRBF(object): psi0_gpu = self.gpuCache['psi0_gpu'] psi1_gpu = self.gpuCache['psi1_gpu'] psi2_gpu = self.gpuCache['psi2_gpu'] - + + if het_noise: + l_gpu.set(np.asfortranarray(lengthscale**2)) + else: + l_gpu.fill(lengthscale*lengthscale) Z_gpu.set(np.asfortranarray(Z)) mu_gpu.set(np.asfortranarray(mu)) - S_gpu.set(S) - gamma_gpu.set(gamma) + S_gpu.set(np.asfortranarray(S)) + gamma_gpu.set(np.asfortranarray(gamma)) linalg_gpu.log(gamma_gpu,logGamma_gpu) linalg_gpu.logOne(gamma_gpu,log1Gamma_gpu) psi0_gpu.fill(variance) - if het_noise: - comp_logpsidenom_het(logpsidenom_gpu, S_gpu,l_gpu,1.0) - comp_psi1_het(psi1_gpu, variance, lengthscale2, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsidenom_gpu, N, M, Q) - comp_logpsidenom_het(logpsidenom_gpu, S_gpu,l_gpu,2.0) - comp_psi2_het(psi2_gpu, variance, lengthscale2, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsidenom_gpu, N, M, Q) - else: - comp_logpsidenom(logpsidenom_gpu, S_gpu,lengthscale2,1.0) - comp_psi1(psi1_gpu, variance, lengthscale2, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsidenom_gpu, N, M, Q) - comp_logpsidenom(logpsidenom_gpu, S_gpu,lengthscale2,2.0) - comp_psi2(psi2_gpu, variance, lengthscale2, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsidenom_gpu, N, M, Q) - + comp_logpsidenom(logpsidenom_gpu, S_gpu,l_gpu,1.0,N) + comp_psi1(psi1_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsidenom_gpu, N, M, Q) + comp_logpsidenom(logpsidenom_gpu, S_gpu,l_gpu,2.0,N) + comp_psi2(psi2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsidenom_gpu, N, M, Q) + return psi0_gpu.get(), psi1_gpu.get(), psi2_gpu.get() @@ -290,7 +287,8 @@ def _psi1computations(variance, lengthscale, Z, mu, S, gamma): M = Z.shape[0] Q = mu.shape[1] - l_gpu = gpuarray.to_gpu(np.asfortranarray(lengthscale2)) + l_gpu = gpuarray.gpuarray.empty((Q,),np.float64, order='F') + l_gpu.fill(lengthscale2) Z_gpu = gpuarray.to_gpu(np.asfortranarray(Z)) mu_gpu = gpuarray.to_gpu(np.asfortranarray(mu)) S_gpu = gpuarray.to_gpu(np.asfortranarray(S)) @@ -299,10 +297,19 @@ def _psi1computations(variance, lengthscale, Z, mu, S, gamma): log1Gamma_gpu = gpuarray.to_gpu(np.asfortranarray(np.log(1.-gamma))) logpsi1denom_gpu = gpuarray.to_gpu(np.asfortranarray(np.log(S/lengthscale2+1.))) psi1_gpu = gpuarray.empty((mu.shape[0],Z.shape[0]),np.float64, order='F') + psi1_neq_gpu = gpuarray.empty((N,M,Q),np.float64, order='F') + psi1exp1_gpu = gpuarray.empty((N,M,Q),np.float64, order='F') + psi1exp2_gpu = gpuarray.empty((N,M,Q),np.float64, order='F') + dpsi1_dvar_gpu = gpuarray.empty((N,M),np.float64, order='F') + dpsi1_dl_gpu = gpuarray.empty((N,M,Q),np.float64, order='F') + 
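+    # Scratch buffers for the derivative kernels; the *_neq/*exp* arrays
+    # appear to cache per-(n,m,q) factors that comp_dpsi1_dvar fills in
+    # and comp_psi1_der reuses for the remaining derivatives.
+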
dpsi1_dZ_gpu = gpuarray.empty((N,M,Q),np.float64, order='F') + dpsi1_dgamma_gpu = gpuarray.empty((N,M,Q),np.float64, order='F') + dpsi1_dmu_gpu = gpuarray.empty((N,M,Q),np.float64, order='F') + dpsi1_dS_gpu = gpuarray.empty((N,M,Q),np.float64, order='F') - comp_psi1(psi1_gpu, variance, lengthscale2, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi1denom_gpu, N, M, Q) + comp_dpsi1_dvar(dpsi1_dvar_gpu,psi1_neq_gpu,psi1exp1_gpu,psi1exp2_gpu, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi1denom_gpu, N, M, Q) - #print np.abs(psi1_gpu.get()-_psi1).max() + print np.abs(dpsi1_dvar_gpu.get()-_dpsi1_dvariance).max() return _psi1, _dpsi1_dvariance, _dpsi1_dgamma, _dpsi1_dmu, _dpsi1_dS, _dpsi1_dZ, _dpsi1_dlengthscale diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index 20234c99..893e5da3 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -21,8 +21,8 @@ class RBF(Stationary): """ _support_GPU = True - def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='rbf'): - super(RBF, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name) + def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='rbf', useGPU=False): + super(RBF, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name, useGPU=useGPU) self.weave_options = {} self.group_spike_prob = False diff --git a/GPy/kern/_src/stationary.py b/GPy/kern/_src/stationary.py index b6fea5ef..37acbf2d 100644 --- a/GPy/kern/_src/stationary.py +++ b/GPy/kern/_src/stationary.py @@ -41,8 +41,8 @@ class Stationary(Kern): """ - def __init__(self, input_dim, variance, lengthscale, ARD, active_dims, name): - super(Stationary, self).__init__(input_dim, active_dims, name) + def __init__(self, input_dim, variance, lengthscale, ARD, active_dims, name, useGPU=False): + super(Stationary, self).__init__(input_dim, active_dims, name,useGPU=useGPU) self.ARD = ARD if not ARD: if lengthscale is None: diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py index eb7c4428..55ee573c 100644 --- a/GPy/models/ss_gplvm.py +++ b/GPy/models/ss_gplvm.py @@ -63,8 +63,6 @@ class SSGPLVM(SparseGP): kernel.group_spike_prob = True self.variational_prior.group_spike_prob = True - if isinstance(inference_method, VarDTC_GPU) and self.kern._support_GPU: - self.kern.useGPU = True SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, name, **kwargs) self.add_parameter(self.X, index=0) From 98816659dd4cde6fc39b6214946b983f6879335c Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Tue, 1 Apr 2014 12:09:40 +0100 Subject: [PATCH 10/33] [GPU] psi2 ssgplvm --- GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py | 140 +++++++++++++++++--- GPy/kern/_src/rbf.py | 4 +- 2 files changed, 123 insertions(+), 21 deletions(-) diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py index 6ad9b20a..ad186594 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py @@ -90,7 +90,7 @@ try: # The kernel form computing psi1 het_noise comp_dpsi1_dvar = ElementwiseKernel( - "double *dpsi1_dvar, double *psi1_neq, double *psi1exp1, double *psi11exp2, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q", + "double *dpsi1_dvar, double *psi1_neq, double *psi1exp1, double *psi1exp2, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int 
M, int Q", "dpsi1_dvar[i] = comp_dpsi1_dvar_element(psi1_neq, psi1exp1, psi1exp2, l, Z, mu, S, logGamma, log1Gamma, logpsi1denom, N, M, Q, i)", "comp_dpsi1_dvar", preamble=""" @@ -99,7 +99,7 @@ try: #define IDX_MQ(m,q) (q*M+m) #define LOGEXPSUM(a,b) (a>=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) - __device__ double comp_dpsi1_dvar_element(double *psi1_neq, double *psi1exp1, double *psi11exp2, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q, int idx) + __device__ double comp_dpsi1_dvar_element(double *psi1_neq, double *psi1exp1, double *psi1exp2, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q, int idx) { int n = idx%N; int m = idx/N; @@ -107,9 +107,9 @@ try: double psi1_sum = 0; for(int q=0;q=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) - __device__ double comp_dpsi1_der_element(double *dpsi1_dmu, double *dpsi1_dS, double *dpsi1_dgamma, double *dpsi1_dZ, double var, double *psi1_neq, double psi1exp1, double *psi11exp2, double *l, double *Z, double *mu, double *S, double *gamma, int N, int M, int Q, int idx) + __device__ double comp_psi1_der_element(double *dpsi1_dmu, double *dpsi1_dS, double *dpsi1_dgamma, double *dpsi1_dZ, double *psi1_neq, double *psi1exp1, double *psi1exp2, double var, double *l, double *Z, double *mu, double *S, double *gamma, int N, int M, int Q, int idx) { int q = idx/(M*N); int m = (idx%(M*N))/N; @@ -146,6 +145,7 @@ try: double Z_c = Z[IDX_MQ(m,q)]; double S_c = S[IDX_NQ(n,q)]; double l_c = l[q]; + double l_sqrt_c = sqrt(l[q]); double psi1exp1_c = psi1exp1[IDX_NMQ(n,m,q)]; double psi1exp2_c = psi1exp2[IDX_MQ(m,q)]; @@ -153,13 +153,101 @@ try: double denom_sqrt = sqrt(denom); double Zmu = Z_c-mu[IDX_NQ(n,q)]; double psi1_common = gamma_c/(denom_sqrt*denom*l_c); - double gamma1 = 1-gamma_c + double gamma1 = 1-gamma_c; dpsi1_dgamma[IDX_NMQ(n,m,q)] = var*neq*(psi1exp1_c/denom_sqrt - psi1exp2_c); dpsi1_dmu[IDX_NMQ(n,m,q)] = var*neq*(psi1_common*Zmu*psi1exp1_c); dpsi1_dS[IDX_NMQ(n,m,q)] = var*neq*(psi1_common*(Zmu*Zmu/(S_c+l_c)-1.0)*psi1exp1_c)/2.0; dpsi1_dZ[IDX_NMQ(n,m,q)] = var*neq*(-psi1_common*Zmu*psi1exp1_c-gamma1*Z_c/l_c*psi1exp2_c); - return var*neq*(psi1_common*(S_c/l_c+Zmu*Zmu/(S_c+l_c))*psi1exp1_c+gamma1*Z_c*Z_c/l_c*psi1exp2_c)/2.0; + return var*neq*(psi1_common*(S_c/l_c+Zmu*Zmu/(S_c+l_c))*psi1exp1_c+gamma1*Z_c*Z_c/l_c*psi1exp2_c)*l_sqrt_c; + } + """) + + # The kernel form computing psi1 het_noise + comp_dpsi2_dvar = ElementwiseKernel( + "double *dpsi2_dvar, double *psi2_neq, double *psi2exp1, double *psi2exp2, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q", + "dpsi2_dvar[i] = comp_dpsi2_dvar_element(psi2_neq, psi2exp1, psi2exp2, var, l, Z, mu, S, logGamma, log1Gamma, logpsi2denom, N, M, Q, i)", + "comp_dpsi2_dvar", + preamble=""" + #define IDX_NMMQ(n,m1,m2,q) (((q*M+m2)*M+m1)*N+n) + #define IDX_MMQ(m1,m2,q) ((q*M+m2)*M+m1) + #define IDX_NMQ(n,m,q) ((q*M+m)*N+n) + #define IDX_NQ(n,q) (q*N+n) + #define IDX_MQ(m,q) (q*M+m) + #define LOGEXPSUM(a,b) (a>=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) + + __device__ double comp_dpsi2_dvar_element(double *psi2_neq, double *psi2exp1, double *psi2exp2, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q, int idx) + { + // psi2 (n,m1,m2) + int m2 = idx/(M*N); + int m1 = (idx%(M*N))/N; + int n = idx%N; + + 
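+            // Flat idx -> (n, m1, m2) for the N x M x M output stored in
+            // column-major order: idx = n + m1*N + m2*N*M.
+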
double psi2_sum=0; + for(int q=0;q Date: Tue, 1 Apr 2014 17:38:52 +0100 Subject: [PATCH 11/33] [gpu] upate gradient --- GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py | 156 +++++++++++++++++--- GPy/kern/_src/rbf.py | 17 ++- GPy/util/linalg_gpu.py | 18 +++ 3 files changed, 171 insertions(+), 20 deletions(-) diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py index ad186594..b116d9cc 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py @@ -231,8 +231,8 @@ try: double S_c = S[IDX_NQ(n,q)]; double l_c = l[q]; double l_sqrt_c = sqrt(l[q]); - double psi2exp1_c = psi1exp1[IDX_NMMQ(n,m1,m2,q)]; - double psi2exp2_c = psi1exp2[IDX_MMQ(m1,m2,q)]; + double psi2exp1_c = psi2exp1[IDX_NMMQ(n,m1,m2,q)]; + double psi2exp2_c = psi2exp2[IDX_MMQ(m1,m2,q)]; double dZ = Z1_c - Z2_c; double muZ = mu[IDX_NQ(n,q)] - (Z1_c+Z2_c)/2.0; @@ -246,7 +246,7 @@ try: dpsi2_dgamma[IDX_NMMQ(n,m1,m2,q)] = var2*neq*(psi2exp1_c/denom_sqrt - psi2exp2_c); dpsi2_dmu[IDX_NMMQ(n,m1,m2,q)] = var2*neq*(-2.0*psi2_common*muZ*psi2exp1_c); dpsi2_dS[IDX_NMMQ(n,m1,m2,q)] = var2*neq*(psi2_common*(2.0*muZ*muZ/(2.0*S_c+l_c)-1.0)*psi2exp1_c); - dpsi2_dZ[IDX_NMMQ(n,m1,m2,q)] = var2*neq*(psi2_common*(dZ*denom/-2.0+muZ)*psi2exp1_c-gamma1*Z_c/l_c*psi2exp2_c)*2.0; + dpsi2_dZ[IDX_NMMQ(n,m1,m2,q)] = var2*neq*(psi2_common*(dZ*denom/-2.0+muZ)*psi2exp1_c-gamma1*Z1_c/l_c*psi2exp2_c)*2.0; return var2*neq*(psi2_common*(S_c/l_c+dZ*dZ*denom/(4.0*l_c)+muZ*muZ/(2.0*S_c+l_c))*psi2exp1_c+gamma1*Z2/(2.0*l_c)*psi2exp2_c)*l_sqrt_c*2.0; } """) @@ -255,8 +255,8 @@ except: pass class PSICOMP_SSRBF(object): - def __init__(self, cublas_handle): - self.cuhandle = cublas_handle + def __init__(self): + self.cublas_handle = cublas.cublasCreate() self.gpuCache = None def _initGPUCache(self, N, M, Q): @@ -269,17 +269,45 @@ class PSICOMP_SSRBF(object): 'gamma_gpu' :gpuarray.empty((N,Q),np.float64,order='F'), 'logGamma_gpu' :gpuarray.empty((N,Q),np.float64,order='F'), 'log1Gamma_gpu' :gpuarray.empty((N,Q),np.float64,order='F'), - 'logpsidenom_gpu' :gpuarray.empty((N,Q),np.float64,order='F'), + 'logpsi1denom_gpu' :gpuarray.empty((N,Q),np.float64,order='F'), + 'logpsi2denom_gpu' :gpuarray.empty((N,Q),np.float64,order='F'), 'psi0_gpu' :gpuarray.empty((N,),np.float64,order='F'), 'psi1_gpu' :gpuarray.empty((N,M),np.float64,order='F'), 'psi2_gpu' :gpuarray.empty((N,M,M),np.float64,order='F'), + # derivatives psi1 + 'psi1_neq_gpu' :gpuarray.empty((N,M,Q),np.float64, order='F'), + 'psi1exp1_gpu' :gpuarray.empty((N,M,Q),np.float64, order='F'), + 'psi1exp2_gpu' :gpuarray.empty((N,M,Q),np.float64, order='F'), + 'dpsi1_dvar_gpu' :gpuarray.empty((N,M),np.float64, order='F'), + 'dpsi1_dl_gpu' :gpuarray.empty((N,M,Q),np.float64, order='F'), + 'dpsi1_dZ_gpu' :gpuarray.empty((N,M,Q),np.float64, order='F'), + 'dpsi1_dgamma_gpu' :gpuarray.empty((N,M,Q),np.float64, order='F'), + 'dpsi1_dmu_gpu' :gpuarray.empty((N,M,Q),np.float64, order='F'), + 'dpsi1_dS_gpu' :gpuarray.empty((N,M,Q),np.float64, order='F'), + # derivatives psi2 + 'psi2_neq_gpu' :gpuarray.empty((N,M,M,Q),np.float64, order='F'), + 'psi2exp1_gpu' :gpuarray.empty((N,M,M,Q),np.float64, order='F'), + 'psi2exp2_gpu' :gpuarray.empty((M,M,Q),np.float64, order='F'), + 'dpsi2_dvar_gpu' :gpuarray.empty((N,M,M),np.float64, order='F'), + 'dpsi2_dl_gpu' :gpuarray.empty((N,M,M,Q),np.float64, order='F'), + 'dpsi2_dZ_gpu' :gpuarray.empty((N,M,M,Q),np.float64, order='F'), + 'dpsi2_dgamma_gpu' :gpuarray.empty((N,M,M,Q),np.float64, order='F'), + 
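+                         # Each NxMxMxQ buffer stores N*M*M*Q doubles; for
+                         # illustrative sizes N=1000, M=50, Q=10 that is
+                         # 1000*50*50*10*8 bytes = 200 MB per buffer, so the
+                         # dense psi2 derivatives dominate GPU memory here.
+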
'dpsi2_dmu_gpu' :gpuarray.empty((N,M,M,Q),np.float64, order='F'), + 'dpsi2_dS_gpu' :gpuarray.empty((N,M,M,Q),np.float64, order='F'), + # gradients + 'grad_l_gpu' :gpuarray.empty((Q,),np.float64,order='F'), + 'grad_Z_gpu' :gpuarray.empty((M,Q),np.float64,order='F'), + 'grad_mu_gpu' :gpuarray.empty((N,Q),np.float64,order='F'), + 'grad_S_gpu' :gpuarray.empty((N,Q),np.float64,order='F'), + 'grad_gamma_gpu' :gpuarray.empty((N,Q),np.float64,order='F'), } - + def psicomputations(self, variance, lengthscale, Z, mu, S, gamma): + """Compute Psi statitsitcs""" if isinstance(lengthscale, np.ndarray) and len(lengthscale)>1: - het_noise = True + ARD = True else: - het_noise = False + ARD = False N = mu.shape[0] M = Z.shape[0] @@ -293,12 +321,13 @@ class PSICOMP_SSRBF(object): gamma_gpu = self.gpuCache['gamma_gpu'] logGamma_gpu = self.gpuCache['logGamma_gpu'] log1Gamma_gpu = self.gpuCache['log1Gamma_gpu'] - logpsidenom_gpu = self.gpuCache['logpsidenom_gpu'] + logpsi1denom_gpu = self.gpuCache['logpsi1denom_gpu'] + logpsi2denom_gpu = self.gpuCache['logpsi2denom_gpu'] psi0_gpu = self.gpuCache['psi0_gpu'] psi1_gpu = self.gpuCache['psi1_gpu'] psi2_gpu = self.gpuCache['psi2_gpu'] - if het_noise: + if ARD: l_gpu.set(np.asfortranarray(lengthscale**2)) else: l_gpu.fill(lengthscale*lengthscale) @@ -308,15 +337,106 @@ class PSICOMP_SSRBF(object): gamma_gpu.set(np.asfortranarray(gamma)) linalg_gpu.log(gamma_gpu,logGamma_gpu) linalg_gpu.logOne(gamma_gpu,log1Gamma_gpu) + comp_logpsidenom(logpsi1denom_gpu, S_gpu,l_gpu,1.0,N) + comp_logpsidenom(logpsi2denom_gpu, S_gpu,l_gpu,2.0,N) - psi0_gpu.fill(variance) - comp_logpsidenom(logpsidenom_gpu, S_gpu,l_gpu,1.0,N) - comp_psi1(psi1_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsidenom_gpu, N, M, Q) - comp_logpsidenom(logpsidenom_gpu, S_gpu,l_gpu,2.0,N) - comp_psi2(psi2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsidenom_gpu, N, M, Q) + comp_psi1(psi1_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi1denom_gpu, N, M, Q) + comp_psi2(psi2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi2denom_gpu, N, M, Q) return psi0_gpu.get(), psi1_gpu.get(), psi2_gpu.get() + + def _psiDercomputations(self, variance, lengthscale, Z, mu, S, gamma): + """Compute the derivatives w.r.t. 
Psi statistics""" + N, M, Q = mu.shape[0],Z.shape[0], mu.shape[1] + self._initGPUCache(N,M,Q) + l_gpu = self.gpuCache['l_gpu'] + Z_gpu = self.gpuCache['Z_gpu'] + mu_gpu = self.gpuCache['mu_gpu'] + S_gpu = self.gpuCache['S_gpu'] + gamma_gpu = self.gpuCache['gamma_gpu'] + logGamma_gpu = self.gpuCache['logGamma_gpu'] + log1Gamma_gpu = self.gpuCache['log1Gamma_gpu'] + logpsi1denom_gpu = self.gpuCache['logpsi1denom_gpu'] + logpsi2denom_gpu = self.gpuCache['logpsi2denom_gpu'] + + psi1_neq_gpu = self.gpuCache['psi1_neq_gpu'] + psi1exp1_gpu = self.gpuCache['psi1exp1_gpu'] + psi1exp2_gpu = self.gpuCache['psi1exp2_gpu'] + dpsi1_dvar_gpu = self.gpuCache['dpsi1_dvar_gpu'] + dpsi1_dl_gpu = self.gpuCache['dpsi1_dl_gpu'] + dpsi1_dZ_gpu = self.gpuCache['dpsi1_dZ_gpu'] + dpsi1_dgamma_gpu = self.gpuCache['dpsi1_dgamma_gpu'] + dpsi1_dmu_gpu = self.gpuCache['dpsi1_dmu_gpu'] + dpsi1_dS_gpu = self.gpuCache['dpsi1_dS_gpu'] + + psi2_neq_gpu = self.gpuCache['psi2_neq_gpu'] + psi2exp1_gpu = self.gpuCache['psi2exp1_gpu'] + psi2exp2_gpu = self.gpuCache['psi2exp2_gpu'] + dpsi2_dvar_gpu = self.gpuCache['dpsi2_dvar_gpu'] + dpsi2_dl_gpu = self.gpuCache['dpsi2_dl_gpu'] + dpsi2_dZ_gpu = self.gpuCache['dpsi2_dZ_gpu'] + dpsi2_dgamma_gpu = self.gpuCache['dpsi2_dgamma_gpu'] + dpsi2_dmu_gpu = self.gpuCache['dpsi2_dmu_gpu'] + dpsi2_dS_gpu = self.gpuCache['dpsi2_dS_gpu'] + + #========================================================================================================== + # Assuming the l_gpu, Z_gpu, mu_gpu, S_gpu, gamma_gpu, logGamma_gpu, log1Gamma_gpu, + # logpsi1denom_gpu, logpsi2denom_gpu has been synchonized. + #========================================================================================================== + + # psi1 derivatives + comp_dpsi1_dvar(dpsi1_dvar_gpu, psi1_neq_gpu, psi1exp1_gpu,psi1exp2_gpu, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi1denom_gpu, N, M, Q) + comp_psi1_der(dpsi1_dl_gpu,dpsi1_dmu_gpu,dpsi1_dS_gpu,dpsi1_dgamma_gpu, dpsi1_dZ_gpu, psi1_neq_gpu,psi1exp1_gpu,psi1exp2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, gamma_gpu, N, M, Q) + + # psi2 derivatives + comp_dpsi2_dvar(dpsi2_dvar_gpu, psi2_neq_gpu, psi2exp1_gpu,psi2exp2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi2denom_gpu, N, M, Q) + comp_psi2_der(dpsi2_dl_gpu,dpsi2_dmu_gpu,dpsi2_dS_gpu,dpsi2_dgamma_gpu, dpsi2_dZ_gpu, psi2_neq_gpu,psi2exp1_gpu,psi2exp2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, gamma_gpu, N, M, Q) + + def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): + mu = variational_posterior.mean + S = variational_posterior.variance + gamma = variational_posterior.binary_prob + self._psiDercomputations(variance, lengthscale, Z, mu, S, gamma) + N, M, Q = mu.shape[0],Z.shape[0], mu.shape[1] + + if isinstance(lengthscale, np.ndarray) and len(lengthscale)>1: + ARD = True + else: + ARD = False + + dpsi1_dvar_gpu = self.gpuCache['dpsi1_dvar_gpu'] + dpsi2_dvar_gpu = self.gpuCache['dpsi2_dvar_gpu'] + dpsi1_dl_gpu = self.gpuCache['dpsi1_dl_gpu'] + dpsi2_dl_gpu = self.gpuCache['dpsi2_dl_gpu'] + psi1_comb_gpu = self.gpuCache['psi1_neq_gpu'] + psi2_comb_gpu = self.gpuCache['psi1_neq_gpu'] + grad_dl_gpu = self.gpuCache['grad_l_gpu'] + + # variance + variance.gradient = cublas.cublasDasum(self.cublas_handle, dL_dpsi0.size, dL_dpsi0, 1) \ + + cublas.cublasDdot(self.cublas_handle, dL_dpsi1.size, dL_dpsi1.gpudata, 1, dpsi1_dvar_gpu.gpudata, 1) \ + + cublas.cublasDdot(self.cublas_handle, dL_dpsi2.size, dL_dpsi2.gpudata, 
1, dpsi2_dvar_gpu.gpudata, 1) + + # lengscale + if ARD: + grad_dl_gpu.fill(0.) + linalg_gpu.mul_bcast(psi1_comb_gpu, dL_dpsi1, dpsi1_dl_gpu, dL_dpsi1.size) + linalg_gpu.sum_axis(grad_dl_gpu, psi1_comb_gpu, 1, N*M) + linalg_gpu.mul_bcast(psi2_comb_gpu, dL_dpsi2, dpsi2_dl_gpu, dL_dpsi2.size) + linalg_gpu.sum_axis(grad_dl_gpu, psi2_comb_gpu, 1, N*M*M) + lengthscale.gradient = grad_dl_gpu.get() + else: + linalg_gpu.mul_bcast(psi1_comb_gpu, dL_dpsi1, dpsi1_dl_gpu, dL_dpsi1.size) + linalg_gpu.mul_bcast(psi2_comb_gpu, dL_dpsi2, dpsi2_dl_gpu, dL_dpsi2.size) + lengthscale.gradient = cublas.cublasDasum(self.cublas_handle, psi1_comb_gpu.size, psi1_comb_gpu, 1) \ + + cublas.cublasDasum(self.cublas_handle, psi2_comb_gpu.size, psi2_comb_gpu, 1) + + def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, mu, S, gamma): + pass + + def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, mu, S, gamma): + pass @Cache_this(limit=1) def _Z_distances(Z): @@ -474,9 +594,9 @@ def _psi2computations(variance, lengthscale, Z, mu, S, gamma): #comp_psi2(psi2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi2denom_gpu, N, M, Q) - comp_dpsi2_dvar(dpsi2_dvar_gpu,psi2_neq_gpu,psi2exp1_gpu,psi2exp2_gpu, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi2denom_gpu, N, M, Q) + comp_dpsi2_dvar(dpsi2_dvar_gpu,psi2_neq_gpu,psi2exp1_gpu,psi2exp2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi2denom_gpu, N, M, Q) comp_psi2_der(dpsi2_dl_gpu,dpsi2_dmu_gpu,dpsi2_dS_gpu,dpsi2_dgamma_gpu, dpsi2_dZ_gpu, psi2_neq_gpu,psi2exp1_gpu,psi2exp2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, gamma_gpu, N, M, Q) - print np.abs(dpsi2_dvar_gpu.get()-_dpsi2_dvariance).max() +# print np.abs(dpsi2_dvar_gpu.get()-_dpsi2_dvariance).max() return _psi2, _dpsi2_dvariance, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _dpsi2_dZ, _dpsi2_dlengthscale diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index a840162d..22966448 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -11,6 +11,9 @@ from ...core.parameterization import variational from psi_comp import ssrbf_psi_comp from psi_comp.ssrbf_psi_gpucomp import PSICOMP_SSRBF +import pycuda.gpuarray as gpuarray +import pycuda.autoinit + class RBF(Stationary): """ Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel: @@ -26,8 +29,8 @@ class RBF(Stationary): self.weave_options = {} self.group_spike_prob = False -# if self.useGPU: -# self.psicomp = PSICOMP_SSRBF() + if self.useGPU: + self.psicomp = PSICOMP_SSRBF() def K_of_r(self, r): @@ -70,6 +73,13 @@ class RBF(Stationary): def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): # Spike-and-Slab GPLVM if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): + dL_dpsi0_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi0)) + dL_dpsi1_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi1)) + dL_dpsi2_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi2)) + self.psicomp.update_gradients_expectations(dL_dpsi0_gpu, dL_dpsi1_gpu, dL_dpsi2_gpu, self.variance, self.lengthscale, Z, variational_posterior) + vg = self.variance.gradient.copy() + lg = self.lengthscale.gradient.copy() + _, _dpsi1_dvariance, _, _, _, _, _dpsi1_dlengthscale = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) _, _dpsi2_dvariance, _, _, _, _, 
_dpsi2_dlengthscale = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) @@ -89,6 +99,9 @@ class RBF(Stationary): self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).reshape(-1,self.input_dim).sum(axis=0) else: self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).sum() + + print np.abs(vg-self.variance.gradient) + print np.abs(lg-self.lengthscale.gradient) elif isinstance(variational_posterior, variational.NormalPosterior): l2 = self.lengthscale**2 diff --git a/GPy/util/linalg_gpu.py b/GPy/util/linalg_gpu.py index d2528a63..73d57e1f 100644 --- a/GPy/util/linalg_gpu.py +++ b/GPy/util/linalg_gpu.py @@ -25,5 +25,23 @@ try: # log(1.0-X) logOne = ElementwiseKernel("double *in, double *out", "out[i] = log(1.-in[i])", "logOne_element") + + # multiplication with broadcast on the last dimension + mul_bcast = ElementwiseKernel("double *out, double *shorter, double *longer, int shorter_size", "out[i] = longer[i]*shorter[i%shorter_size]", "mul_bcast") + + # sum through the middle dimension (size_2) of a 3D matrix (size_1, size_2, size_3) + sum_axis = ElementwiseKernel("double *out, double *in, int size_1, int size_2", "out[i] += sum_axis_element(in, size_1, size_2, i)", "sum_axis",preamble=""" + __device__ double sum_axis_element(double *in, int size_1, int size_2, int idx) + { + int k = idx/size_1; + int i = idx%size_1; + double sum=0; + for(int j=0;j Date: Tue, 1 Apr 2014 18:10:35 +0100 Subject: [PATCH 12/33] [GPU] bug fix --- GPy/inference/latent_function_inference/var_dtc_gpu.py | 5 +++-- GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py | 5 ++--- GPy/util/linalg_gpu.py | 2 ++ 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py index 75a07992..e223af3c 100644 --- a/GPy/inference/latent_function_inference/var_dtc_gpu.py +++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py @@ -15,7 +15,7 @@ try: from scikits.cuda import cublas import pycuda.autoinit from pycuda.reduction import ReductionKernel - from ...util.linalg_gpu import logDiagSum + from ...util.linalg_gpu import logDiagSum, strideSum except: pass @@ -212,7 +212,8 @@ class VarDTC_GPU(object): cublas.cublasDcopy(self.cublas_handle, psi2_gpu.size, psi2_gpu.gpudata, 1, LmInvPsi2LmInvT_gpu.gpudata, 1) cublas.cublasDtrsm(self.cublas_handle , 'L', 'L', 'N', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, LmInvPsi2LmInvT_gpu.gpudata, num_inducing) cublas.cublasDtrsm(self.cublas_handle , 'r', 'L', 'T', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, LmInvPsi2LmInvT_gpu.gpudata, num_inducing) - tr_LmInvPsi2LmInvT = cublas.cublasDasum(self.cublas_handle, num_inducing, LmInvPsi2LmInvT_gpu.gpudata, num_inducing+1) + #tr_LmInvPsi2LmInvT = cublas.cublasDasum(self.cublas_handle, num_inducing, LmInvPsi2LmInvT_gpu.gpudata, num_inducing+1) + tr_LmInvPsi2LmInvT = strideSum(LmInvPsi2LmInvT_gpu, num_inducing+1) print np.abs(vvt-vvt_gpu.get()).max() print np.abs(np.trace(LmInvPsi2LmInvT)-tr_LmInvPsi2LmInvT) diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py index b116d9cc..da948661 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py @@ -414,7 +414,7 @@ class PSICOMP_SSRBF(object): grad_dl_gpu = self.gpuCache['grad_l_gpu'] 
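         # gpuarray.sum returns a 0-d GPUArray (a device-side scalar), so .get()
         # is needed to pull the value back to the host before assignment.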
# variance - variance.gradient = cublas.cublasDasum(self.cublas_handle, dL_dpsi0.size, dL_dpsi0, 1) \ + variance.gradient = gpuarray.sum(dL_dpsi0) \ + cublas.cublasDdot(self.cublas_handle, dL_dpsi1.size, dL_dpsi1.gpudata, 1, dpsi1_dvar_gpu.gpudata, 1) \ + cublas.cublasDdot(self.cublas_handle, dL_dpsi2.size, dL_dpsi2.gpudata, 1, dpsi2_dvar_gpu.gpudata, 1) @@ -429,8 +429,7 @@ class PSICOMP_SSRBF(object): else: linalg_gpu.mul_bcast(psi1_comb_gpu, dL_dpsi1, dpsi1_dl_gpu, dL_dpsi1.size) linalg_gpu.mul_bcast(psi2_comb_gpu, dL_dpsi2, dpsi2_dl_gpu, dL_dpsi2.size) - lengthscale.gradient = cublas.cublasDasum(self.cublas_handle, psi1_comb_gpu.size, psi1_comb_gpu, 1) \ - + cublas.cublasDasum(self.cublas_handle, psi2_comb_gpu.size, psi2_comb_gpu, 1) + lengthscale.gradient = gpuarray.sum(psi1_comb_gpu) + gpuarray.sum(psi2_comb_gpu) def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, mu, S, gamma): pass diff --git a/GPy/util/linalg_gpu.py b/GPy/util/linalg_gpu.py index 73d57e1f..60eb8101 100644 --- a/GPy/util/linalg_gpu.py +++ b/GPy/util/linalg_gpu.py @@ -16,6 +16,8 @@ try: # logDiagSum(A, A.shape[0]+1) logDiagSum = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="i%step==0?log(x[i]):0", arguments="double *x, int step") + strideSum = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="i%step==0?x[i]:0", arguments="double *x, int step") + #======================================================================================= # Element-wise functions #======================================================================================= From c20cd69c4ebea614867115c40371cbb386c8accb Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Wed, 2 Apr 2014 10:52:04 +0100 Subject: [PATCH 13/33] [GPU] bug fix --- GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py | 4 ++-- GPy/util/linalg_gpu.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py index da948661..12c39e16 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py @@ -414,7 +414,7 @@ class PSICOMP_SSRBF(object): grad_dl_gpu = self.gpuCache['grad_l_gpu'] # variance - variance.gradient = gpuarray.sum(dL_dpsi0) \ + variance.gradient = gpuarray.sum(dL_dpsi0).get() \ + cublas.cublasDdot(self.cublas_handle, dL_dpsi1.size, dL_dpsi1.gpudata, 1, dpsi1_dvar_gpu.gpudata, 1) \ + cublas.cublasDdot(self.cublas_handle, dL_dpsi2.size, dL_dpsi2.gpudata, 1, dpsi2_dvar_gpu.gpudata, 1) @@ -429,7 +429,7 @@ class PSICOMP_SSRBF(object): else: linalg_gpu.mul_bcast(psi1_comb_gpu, dL_dpsi1, dpsi1_dl_gpu, dL_dpsi1.size) linalg_gpu.mul_bcast(psi2_comb_gpu, dL_dpsi2, dpsi2_dl_gpu, dL_dpsi2.size) - lengthscale.gradient = gpuarray.sum(psi1_comb_gpu) + gpuarray.sum(psi2_comb_gpu) + lengthscale.gradient = gpuarray.sum(psi1_comb_gpu).get() + gpuarray.sum(psi2_comb_gpu).get() def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, mu, S, gamma): pass diff --git a/GPy/util/linalg_gpu.py b/GPy/util/linalg_gpu.py index 60eb8101..6f5dc45b 100644 --- a/GPy/util/linalg_gpu.py +++ b/GPy/util/linalg_gpu.py @@ -28,8 +28,8 @@ try: # log(1.0-X) logOne = ElementwiseKernel("double *in, double *out", "out[i] = log(1.-in[i])", "logOne_element") - # multiplication with broadcast on the last dimension - mul_bcast = ElementwiseKernel("double *out, double *shorter, double *longer, int shorter_size", "out[i] = longer[i]*shorter[i%shorter_size]", "mul_bcast") + # 
multiplication with broadcast on the last dimension (a has to be smaller than b) + mul_bcast = ElementwiseKernel("double *out, double *a, double *b, int a_size", "out[i] = b[i]*a[i % a_size ]", "mul_bcast") # sum through the middle dimension (size_2) of a 3D matrix (size_1, size_2, size_3) sum_axis = ElementwiseKernel("double *out, double *in, int size_1, int size_2", "out[i] += sum_axis_element(in, size_1, size_2, i)", "sum_axis",preamble=""" From 73f690a4c94c07c79e211554e36f443ee69aafb2 Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Wed, 2 Apr 2014 11:22:56 +0100 Subject: [PATCH 14/33] [GPU] bug fix --- GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py | 2 +- GPy/util/linalg_gpu.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py index 12c39e16..a21a15e3 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py @@ -410,7 +410,7 @@ class PSICOMP_SSRBF(object): dpsi1_dl_gpu = self.gpuCache['dpsi1_dl_gpu'] dpsi2_dl_gpu = self.gpuCache['dpsi2_dl_gpu'] psi1_comb_gpu = self.gpuCache['psi1_neq_gpu'] - psi2_comb_gpu = self.gpuCache['psi1_neq_gpu'] + psi2_comb_gpu = self.gpuCache['psi2_neq_gpu'] grad_dl_gpu = self.gpuCache['grad_l_gpu'] # variance diff --git a/GPy/util/linalg_gpu.py b/GPy/util/linalg_gpu.py index 6f5dc45b..60eb8101 100644 --- a/GPy/util/linalg_gpu.py +++ b/GPy/util/linalg_gpu.py @@ -28,8 +28,8 @@ try: # log(1.0-X) logOne = ElementwiseKernel("double *in, double *out", "out[i] = log(1.-in[i])", "logOne_element") - # multiplication with broadcast on the last dimension (a has to be smaller than b) - mul_bcast = ElementwiseKernel("double *out, double *a, double *b, int a_size", "out[i] = b[i]*a[i % a_size ]", "mul_bcast") + # multiplication with broadcast on the last dimension + mul_bcast = ElementwiseKernel("double *out, double *shorter, double *longer, int shorter_size", "out[i] = longer[i]*shorter[i%shorter_size]", "mul_bcast") # sum through the middle dimension (size_2) of a 3D matrix (size_1, size_2, size_3) sum_axis = ElementwiseKernel("double *out, double *in, int size_1, int size_2", "out[i] += sum_axis_element(in, size_1, size_2, i)", "sum_axis",preamble=""" From b90a8672321864dbb6d9fbfbd96dfed30c459612 Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Wed, 2 Apr 2014 11:43:32 +0100 Subject: [PATCH 15/33] [GPU] update gradients rest --- GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py | 76 ++++++++++++++++++--- GPy/kern/_src/rbf.py | 68 ++++++++++-------- 2 files changed, 106 insertions(+), 38 deletions(-) diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py index a21a15e3..d8c84df4 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py @@ -246,7 +246,7 @@ try: dpsi2_dgamma[IDX_NMMQ(n,m1,m2,q)] = var2*neq*(psi2exp1_c/denom_sqrt - psi2exp2_c); dpsi2_dmu[IDX_NMMQ(n,m1,m2,q)] = var2*neq*(-2.0*psi2_common*muZ*psi2exp1_c); dpsi2_dS[IDX_NMMQ(n,m1,m2,q)] = var2*neq*(psi2_common*(2.0*muZ*muZ/(2.0*S_c+l_c)-1.0)*psi2exp1_c); - dpsi2_dZ[IDX_NMMQ(n,m1,m2,q)] = var2*neq*(psi2_common*(dZ*denom/-2.0+muZ)*psi2exp1_c-gamma1*Z1_c/l_c*psi2exp2_c)*2.0; + dpsi2_dZ[IDX_NMMQ(n,m1,m2,q)] = var2*neq*(psi2_common*(dZ*denom/-2.0+muZ)*psi2exp1_c-gamma1*Z2_c/l_c*psi2exp2_c)*2.0; return var2*neq*(psi2_common*(S_c/l_c+dZ*dZ*denom/(4.0*l_c)+muZ*muZ/(2.0*S_c+l_c))*psi2exp1_c+gamma1*Z2/(2.0*l_c)*psi2exp2_c)*l_sqrt_c*2.0; } """) @@ -411,7 +411,7 @@ class 
PSICOMP_SSRBF(object): dpsi2_dl_gpu = self.gpuCache['dpsi2_dl_gpu'] psi1_comb_gpu = self.gpuCache['psi1_neq_gpu'] psi2_comb_gpu = self.gpuCache['psi2_neq_gpu'] - grad_dl_gpu = self.gpuCache['grad_l_gpu'] + grad_l_gpu = self.gpuCache['grad_l_gpu'] # variance variance.gradient = gpuarray.sum(dL_dpsi0).get() \ @@ -420,22 +420,78 @@ class PSICOMP_SSRBF(object): # lengscale if ARD: - grad_dl_gpu.fill(0.) + grad_l_gpu.fill(0.) linalg_gpu.mul_bcast(psi1_comb_gpu, dL_dpsi1, dpsi1_dl_gpu, dL_dpsi1.size) - linalg_gpu.sum_axis(grad_dl_gpu, psi1_comb_gpu, 1, N*M) + linalg_gpu.sum_axis(grad_l_gpu, psi1_comb_gpu, 1, N*M) linalg_gpu.mul_bcast(psi2_comb_gpu, dL_dpsi2, dpsi2_dl_gpu, dL_dpsi2.size) - linalg_gpu.sum_axis(grad_dl_gpu, psi2_comb_gpu, 1, N*M*M) - lengthscale.gradient = grad_dl_gpu.get() + linalg_gpu.sum_axis(grad_l_gpu, psi2_comb_gpu, 1, N*M*M) + lengthscale.gradient = grad_l_gpu.get() else: linalg_gpu.mul_bcast(psi1_comb_gpu, dL_dpsi1, dpsi1_dl_gpu, dL_dpsi1.size) linalg_gpu.mul_bcast(psi2_comb_gpu, dL_dpsi2, dpsi2_dl_gpu, dL_dpsi2.size) lengthscale.gradient = gpuarray.sum(psi1_comb_gpu).get() + gpuarray.sum(psi2_comb_gpu).get() - def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, mu, S, gamma): - pass + def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): + mu = variational_posterior.mean + S = variational_posterior.variance + gamma = variational_posterior.binary_prob + self._psiDercomputations(variance, lengthscale, Z, mu, S, gamma) + N, M, Q = mu.shape[0],Z.shape[0], mu.shape[1] + + dpsi1_dZ_gpu = self.gpuCache['dpsi1_dZ_gpu'] + dpsi2_dZ_gpu = self.gpuCache['dpsi2_dZ_gpu'] + psi1_comb_gpu = self.gpuCache['psi1_neq_gpu'] + psi2_comb_gpu = self.gpuCache['psi2_neq_gpu'] + grad_Z_gpu = self.gpuCache['grad_Z_gpu'] + + grad_Z_gpu.fill(0.) + linalg_gpu.mul_bcast(psi1_comb_gpu, dL_dpsi1, dpsi1_dZ_gpu, dL_dpsi1.size) + linalg_gpu.sum_axis(grad_Z_gpu, psi1_comb_gpu, 1, N) + linalg_gpu.mul_bcast(psi2_comb_gpu, dL_dpsi2, dpsi2_dZ_gpu, dL_dpsi2.size) + linalg_gpu.sum_axis(grad_Z_gpu, psi2_comb_gpu, 1, N*M) + return grad_Z_gpu.get() - def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, mu, S, gamma): - pass + def gradients_qX_expectations(self, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior): + mu = variational_posterior.mean + S = variational_posterior.variance + gamma = variational_posterior.binary_prob + self._psiDercomputations(variance, lengthscale, Z, mu, S, gamma) + N, M, Q = mu.shape[0],Z.shape[0], mu.shape[1] + + dpsi1_dmu_gpu = self.gpuCache['dpsi1_dmu_gpu'] + dpsi2_dmu_gpu = self.gpuCache['dpsi2_dmu_gpu'] + dpsi1_dS_gpu = self.gpuCache['dpsi1_dS_gpu'] + dpsi2_dS_gpu = self.gpuCache['dpsi2_dS_gpu'] + dpsi1_dgamma_gpu = self.gpuCache['dpsi1_dgamma_gpu'] + dpsi2_dgamma_gpu = self.gpuCache['dpsi2_dgamma_gpu'] + psi1_comb_gpu = self.gpuCache['psi1_neq_gpu'] + psi2_comb_gpu = self.gpuCache['psi2_neq_gpu'] + grad_mu_gpu = self.gpuCache['grad_mu_gpu'] + grad_S_gpu = self.gpuCache['grad_S_gpu'] + grad_gamma_gpu = self.gpuCache['grad_gamma_gpu'] + + # mu gradients + grad_mu_gpu.fill(0.) + linalg_gpu.mul_bcast(psi1_comb_gpu, dL_dpsi1, dpsi1_dmu_gpu, dL_dpsi1.size) + linalg_gpu.sum_axis(grad_mu_gpu, psi1_comb_gpu, N, M) + linalg_gpu.mul_bcast(psi2_comb_gpu, dL_dpsi2, dpsi2_dmu_gpu, dL_dpsi2.size) + linalg_gpu.sum_axis(grad_mu_gpu, psi2_comb_gpu, N, M*M) + + # S gradients + grad_S_gpu.fill(0.) 
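# --- note (not part of the patch): the mul_bcast/sum_axis pairs in this method are
# the GPU counterpart of a broadcast-multiply followed by a sum over the inducing
# axes. A minimal NumPy sketch of the same reduction; the shapes N, M, Q below are
# illustrative assumptions, not values from the patch:
import numpy as np
N, M, Q = 3, 4, 2
dL_dpsi1 = np.random.randn(N, M)         # stands in for dL_dpsi1 on the GPU
dpsi1_dS = np.random.randn(N, M, Q)      # stands in for dpsi1_dS_gpu
comb = dL_dpsi1[:, :, None] * dpsi1_dS   # what mul_bcast computes elementwise
grad_S = comb.sum(axis=1)                # what sum_axis reduces away (the M axis), leaving NxQ
# ---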
+ linalg_gpu.mul_bcast(psi1_comb_gpu, dL_dpsi1, dpsi1_dS_gpu, dL_dpsi1.size) + linalg_gpu.sum_axis(grad_S_gpu, psi1_comb_gpu, N, M) + linalg_gpu.mul_bcast(psi2_comb_gpu, dL_dpsi2, dpsi2_dS_gpu, dL_dpsi2.size) + linalg_gpu.sum_axis(grad_S_gpu, psi2_comb_gpu, N, M*M) + + # gamma gradients + grad_gamma_gpu.fill(0.) + linalg_gpu.mul_bcast(psi1_comb_gpu, dL_dpsi1, dpsi1_dgamma_gpu, dL_dpsi1.size) + linalg_gpu.sum_axis(grad_gamma_gpu, psi1_comb_gpu, N, M) + linalg_gpu.mul_bcast(psi2_comb_gpu, dL_dpsi2, dpsi2_dgamma_gpu, dL_dpsi2.size) + linalg_gpu.sum_axis(grad_gamma_gpu, psi2_comb_gpu, N, M*M) + + return grad_mu_gpu.get(), grad_S_gpu.get(), grad_gamma_gpu.get() @Cache_this(limit=1) def _Z_distances(Z): diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index 22966448..39d36cf3 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -73,36 +73,33 @@ class RBF(Stationary): def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): # Spike-and-Slab GPLVM if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): - dL_dpsi0_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi0)) - dL_dpsi1_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi1)) - dL_dpsi2_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi2)) - self.psicomp.update_gradients_expectations(dL_dpsi0_gpu, dL_dpsi1_gpu, dL_dpsi2_gpu, self.variance, self.lengthscale, Z, variational_posterior) - vg = self.variance.gradient.copy() - lg = self.lengthscale.gradient.copy() - - _, _dpsi1_dvariance, _, _, _, _, _dpsi1_dlengthscale = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) - _, _dpsi2_dvariance, _, _, _, _, _dpsi2_dlengthscale = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) - - #contributions from psi0: - self.variance.gradient = np.sum(dL_dpsi0) - - #from psi1 - self.variance.gradient += np.sum(dL_dpsi1 * _dpsi1_dvariance) - if self.ARD: - self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).reshape(-1,self.input_dim).sum(axis=0) + if self.useGPU: + dL_dpsi0_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi0)) + dL_dpsi1_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi1)) + dL_dpsi2_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi2)) + self.psicomp.update_gradients_expectations(dL_dpsi0_gpu, dL_dpsi1_gpu, dL_dpsi2_gpu, self.variance, self.lengthscale, Z, variational_posterior) else: - self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).sum() - - #from psi2 - self.variance.gradient += (dL_dpsi2 * _dpsi2_dvariance).sum() - if self.ARD: - self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).reshape(-1,self.input_dim).sum(axis=0) - else: - self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).sum() - print np.abs(vg-self.variance.gradient) - print np.abs(lg-self.lengthscale.gradient) - + _, _dpsi1_dvariance, _, _, _, _, _dpsi1_dlengthscale = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) + _, _dpsi2_dvariance, _, _, _, _, _dpsi2_dlengthscale = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) + + #contributions from psi0: + self.variance.gradient = np.sum(dL_dpsi0) 
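# --- note (not part of the patch): in the CPU branch below, the ARD case keeps one
# gradient entry per input dimension by flattening the NxMxQ (psi1) and NxMxMxQ
# (psi2) derivative tensors to (-1, Q) before summing, while the isotropic case
# collapses everything to a scalar. A sketch of the idiom with illustrative shapes:
import numpy as np
N, M, Q = 3, 4, 2
dL_dpsi1 = np.random.randn(N, M)
dpsi1_dl = np.random.randn(N, M, Q)
g_ard = (dL_dpsi1[:, :, None] * dpsi1_dl).reshape(-1, Q).sum(axis=0)  # shape (Q,), one per lengthscale
g_iso = (dL_dpsi1[:, :, None] * dpsi1_dl).sum()                       # single shared lengthscale
assert np.allclose(g_ard.sum(), g_iso)
# ---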
+ + #from psi1 + self.variance.gradient += np.sum(dL_dpsi1 * _dpsi1_dvariance) + if self.ARD: + self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).reshape(-1,self.input_dim).sum(axis=0) + else: + self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).sum() + + #from psi2 + self.variance.gradient += (dL_dpsi2 * _dpsi2_dvariance).sum() + if self.ARD: + self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).reshape(-1,self.input_dim).sum(axis=0) + else: + self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).sum() + elif isinstance(variational_posterior, variational.NormalPosterior): l2 = self.lengthscale**2 if l2.size != self.input_dim: @@ -141,6 +138,12 @@ class RBF(Stationary): def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior): # Spike-and-Slab GPLVM if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): + dL_dpsi1_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi1)) + dL_dpsi2_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi2)) + gZ = self.psicomp.gradients_Z_expectations(dL_dpsi1_gpu, dL_dpsi2_gpu, self.variance, self.lengthscale, Z, variational_posterior) + + + _, _, _, _, _, _dpsi1_dZ, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) _, _, _, _, _, _dpsi2_dZ, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) @@ -150,6 +153,8 @@ class RBF(Stationary): #psi2 grad += (dL_dpsi2[:, :, :, None] * _dpsi2_dZ).sum(axis=0).sum(axis=1) + print np.abs(gZ - grad).max() + return grad elif isinstance(variational_posterior, variational.NormalPosterior): @@ -174,6 +179,11 @@ class RBF(Stationary): def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): # Spike-and-Slab GPLVM if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): + dL_dpsi1_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi1)) + dL_dpsi2_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi2)) + gmu,gS,gg = self.psicomp.gradients_qX_expectations(dL_dpsi1_gpu, dL_dpsi2_gpu, self.variance, self.lengthscale, Z, variational_posterior) + + ndata = variational_posterior.mean.shape[0] _, _, _dpsi1_dgamma, _dpsi1_dmu, _dpsi1_dS, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) @@ -191,6 +201,8 @@ class RBF(Stationary): if self.group_spike_prob: grad_gamma[:] = grad_gamma.mean(axis=0) + + print np.abs(gmu-grad_mu).max(),np.abs(gS-grad_S).max(),np.abs(gg-grad_gamma).max() return grad_mu, grad_S, grad_gamma From 24cc9c1bc360173042e3fa8bbda55be9f925e577 Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Wed, 2 Apr 2014 11:48:27 +0100 Subject: [PATCH 16/33] [GPU] gradient check ready --- GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py | 2 +- GPy/kern/_src/rbf.py | 81 ++++++++++----------- 2 files changed, 39 insertions(+), 44 deletions(-) diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py index d8c84df4..2efa7a97 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py @@ -234,7 +234,7 @@ try: double psi2exp1_c = psi2exp1[IDX_NMMQ(n,m1,m2,q)]; double psi2exp2_c = psi2exp2[IDX_MMQ(m1,m2,q)]; - double dZ = Z1_c - Z2_c; + double dZ = Z2_c - Z1_c; 
double muZ = mu[IDX_NQ(n,q)] - (Z1_c+Z2_c)/2.0; double Z2 = Z1_c*Z1_c+Z2_c*Z2_c; double denom = 2.0*S_c/l_c+1.0; diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index 39d36cf3..e5da3d97 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -138,24 +138,21 @@ class RBF(Stationary): def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior): # Spike-and-Slab GPLVM if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): - dL_dpsi1_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi1)) - dL_dpsi2_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi2)) - gZ = self.psicomp.gradients_Z_expectations(dL_dpsi1_gpu, dL_dpsi2_gpu, self.variance, self.lengthscale, Z, variational_posterior) - - - - _, _, _, _, _, _dpsi1_dZ, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) - _, _, _, _, _, _dpsi2_dZ, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) - - #psi1 - grad = (dL_dpsi1[:, :, None] * _dpsi1_dZ).sum(axis=0) - - #psi2 - grad += (dL_dpsi2[:, :, :, None] * _dpsi2_dZ).sum(axis=0).sum(axis=1) - - print np.abs(gZ - grad).max() - - return grad + if self.useGPU: + dL_dpsi1_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi1)) + dL_dpsi2_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi2)) + return self.psicomp.gradients_Z_expectations(dL_dpsi1_gpu, dL_dpsi2_gpu, self.variance, self.lengthscale, Z, variational_posterior) + else: + _, _, _, _, _, _dpsi1_dZ, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) + _, _, _, _, _, _dpsi2_dZ, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) + + #psi1 + grad = (dL_dpsi1[:, :, None] * _dpsi1_dZ).sum(axis=0) + + #psi2 + grad += (dL_dpsi2[:, :, :, None] * _dpsi2_dZ).sum(axis=0).sum(axis=1) + + return grad elif isinstance(variational_posterior, variational.NormalPosterior): l2 = self.lengthscale **2 @@ -179,32 +176,30 @@ class RBF(Stationary): def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior): # Spike-and-Slab GPLVM if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): - dL_dpsi1_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi1)) - dL_dpsi2_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi2)) - gmu,gS,gg = self.psicomp.gradients_qX_expectations(dL_dpsi1_gpu, dL_dpsi2_gpu, self.variance, self.lengthscale, Z, variational_posterior) - - - ndata = variational_posterior.mean.shape[0] - - _, _, _dpsi1_dgamma, _dpsi1_dmu, _dpsi1_dS, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) - _, _, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) - - #psi1 - grad_mu = (dL_dpsi1[:, :, None] * _dpsi1_dmu).sum(axis=1) - grad_S = (dL_dpsi1[:, :, None] * _dpsi1_dS).sum(axis=1) - grad_gamma = (dL_dpsi1[:,:,None] * _dpsi1_dgamma).sum(axis=1) - - #psi2 - grad_mu += (dL_dpsi2[:, :, :, None] * _dpsi2_dmu).reshape(ndata,-1,self.input_dim).sum(axis=1) - 
grad_S += (dL_dpsi2[:, :, :, None] * _dpsi2_dS).reshape(ndata,-1,self.input_dim).sum(axis=1) - grad_gamma += (dL_dpsi2[:,:,:, None] * _dpsi2_dgamma).reshape(ndata,-1,self.input_dim).sum(axis=1) - - if self.group_spike_prob: - grad_gamma[:] = grad_gamma.mean(axis=0) + if self.useGPU: + dL_dpsi1_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi1)) + dL_dpsi2_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi2)) + return self.psicomp.gradients_qX_expectations(dL_dpsi1_gpu, dL_dpsi2_gpu, self.variance, self.lengthscale, Z, variational_posterior) + else: + ndata = variational_posterior.mean.shape[0] + + _, _, _dpsi1_dgamma, _dpsi1_dmu, _dpsi1_dS, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) + _, _, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) + + #psi1 + grad_mu = (dL_dpsi1[:, :, None] * _dpsi1_dmu).sum(axis=1) + grad_S = (dL_dpsi1[:, :, None] * _dpsi1_dS).sum(axis=1) + grad_gamma = (dL_dpsi1[:,:,None] * _dpsi1_dgamma).sum(axis=1) + + #psi2 + grad_mu += (dL_dpsi2[:, :, :, None] * _dpsi2_dmu).reshape(ndata,-1,self.input_dim).sum(axis=1) + grad_S += (dL_dpsi2[:, :, :, None] * _dpsi2_dS).reshape(ndata,-1,self.input_dim).sum(axis=1) + grad_gamma += (dL_dpsi2[:,:,:, None] * _dpsi2_dgamma).reshape(ndata,-1,self.input_dim).sum(axis=1) - print np.abs(gmu-grad_mu).max(),np.abs(gS-grad_S).max(),np.abs(gg-grad_gamma).max() - - return grad_mu, grad_S, grad_gamma + if self.group_spike_prob: + grad_gamma[:] = grad_gamma.mean(axis=0) + + return grad_mu, grad_S, grad_gamma elif isinstance(variational_posterior, variational.NormalPosterior): From f1d831c5f1ba738a1313eee32c5f50c0e0e2fdad Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Wed, 2 Apr 2014 11:56:29 +0100 Subject: [PATCH 17/33] [GPU] bug fix --- GPy/inference/latent_function_inference/var_dtc_gpu.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py index e223af3c..9b36a9ab 100644 --- a/GPy/inference/latent_function_inference/var_dtc_gpu.py +++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py @@ -213,7 +213,7 @@ class VarDTC_GPU(object): cublas.cublasDtrsm(self.cublas_handle , 'L', 'L', 'N', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, LmInvPsi2LmInvT_gpu.gpudata, num_inducing) cublas.cublasDtrsm(self.cublas_handle , 'r', 'L', 'T', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, LmInvPsi2LmInvT_gpu.gpudata, num_inducing) #tr_LmInvPsi2LmInvT = cublas.cublasDasum(self.cublas_handle, num_inducing, LmInvPsi2LmInvT_gpu.gpudata, num_inducing+1) - tr_LmInvPsi2LmInvT = strideSum(LmInvPsi2LmInvT_gpu, num_inducing+1) + tr_LmInvPsi2LmInvT = float(strideSum(LmInvPsi2LmInvT_gpu, num_inducing+1).get()) print np.abs(vvt-vvt_gpu.get()).max() print np.abs(np.trace(LmInvPsi2LmInvT)-tr_LmInvPsi2LmInvT) @@ -255,8 +255,8 @@ class VarDTC_GPU(object): logL_R = -num_data*np.log(beta) logL_old = -(output_dim*(num_data*log_2_pi+logL_R+psi0_full-np.trace(LmInvPsi2LmInvT))+YRY_full-bbt)/2.-output_dim*(-np.log(np.diag(Lm)).sum()+np.log(np.diag(LL)).sum()) - logdetKmm = logDiagSum(Lm_gpu,num_inducing+1) - logdetLambda = logDiagSum(LL_gpu,num_inducing+1) + logdetKmm = 
float(logDiagSum(Lm_gpu,num_inducing+1).get()) + logdetLambda = float(logDiagSum(LL_gpu,num_inducing+1).get()) logL = -(output_dim*(num_data*log_2_pi+logL_R+psi0_full-tr_LmInvPsi2LmInvT)+YRY_full-bbt)/2.+output_dim*(logdetKmm-logdetLambda) print np.abs(logL_old - logL) From daf5a877f35e26f633065ff7270c3e5c6d408a58 Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Thu, 3 Apr 2014 10:59:17 +0100 Subject: [PATCH 18/33] [GPU] vardtc_likelihood --- .../latent_function_inference/var_dtc_gpu.py | 183 ++++++++++++------ GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py | 183 ++---------------- 2 files changed, 142 insertions(+), 224 deletions(-) diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py index 9b36a9ab..c18102e4 100644 --- a/GPy/inference/latent_function_inference/var_dtc_gpu.py +++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py @@ -15,7 +15,7 @@ try: from scikits.cuda import cublas import pycuda.autoinit from pycuda.reduction import ReductionKernel - from ...util.linalg_gpu import logDiagSum, strideSum + from ...util.linalg_gpu import logDiagSum, strideSum, mul_bcast, sum_axis except: pass @@ -49,7 +49,7 @@ class VarDTC_GPU(object): # Initialize GPU caches self.gpuCache = None - def _initGPUCache(self, num_inducing, output_dim): + def _initGPUCache(self, num_inducing, output_dim, Y): if self.gpuCache == None: self.gpuCache = {# inference_likelihood 'Kmm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), @@ -63,17 +63,19 @@ class VarDTC_GPU(object): 'KmmInvPsi2P_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), 'dL_dpsi2R_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), 'dL_dKmm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), + 'psi1Y_gpu' :gpuarray.empty((num_inducing,output_dim),np.float64,order='F'), + 'psi2_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), + 'beta_gpu' :gpuarray.empty((output_dim,),np.float64,order='F'), + 'Y_gpu' :gpuarray.to_gpu(np.asfortranarray(Y)), + 'betaY_gpu' :gpuarray.empty(Y.shape,np.float64,order='F'), + 'psi2_t_gpu' :gpuarray.empty((self.batchsize,num_inducing,num_inducing),np.float64,order='F'), # inference_minibatch } self.gpuCache['ones_gpu'].fill(1.0) - - def set_limit(self, limit): - self.get_trYYT.limit = limit - self.get_YYTfactor.limit = limit + + Y_gpu = self.gpuCache['Y_gpu'] + self._trYYT = cublas.cublasDdot(self.cublas_handle, Y_gpu.size, Y_gpu.gpudata, 1, Y_gpu.gpudata, 1) - def _get_trYYT(self, Y): - return param_to_array(np.sum(np.square(Y))) - def _get_YYTfactor(self, Y): """ find a matrix L which satisfies LLT = YYT. 
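# --- note (not part of the patch): the factor documented above works because the
# inference only touches Y through Y*Y.T, so for N < D an NxN Cholesky factor of
# Y*Y.T can stand in for the NxD data. A quick NumPy check of the convention (plain
# numpy calls stand in for GPy's jitchol/tdot here, an assumption of the sketch):
import numpy as np
N, D = 3, 10
Y = np.random.randn(N, D)
L = np.linalg.cholesky(Y.dot(Y.T))   # NxN, plays the role of jitchol(tdot(Y))
assert np.allclose(L.dot(L.T), Y.dot(Y.T))
# ---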
@@ -94,7 +96,7 @@ class VarDTC_GPU(object): Cached intermediate results: Kmm, KmmInv, """ - num_inducing = Z.shape[0] + num_inducing = Z.shape[0] num_data, output_dim = Y.shape self._initGPUCache(num_inducing, output_dim) @@ -107,59 +109,120 @@ class VarDTC_GPU(object): #see whether we've got a different noise variance for each datum beta = 1./np.fmax(likelihood.variance, 1e-6) het_noise = beta.size > 1 - trYYT = self.get_trYYT(Y) + trYYT = self._trYYT + psi1Y_gpu = self.gpuCache['psi1Y_gpu'] + psi2_gpu = self.gpuCache['psi2_gpu'] + beta_gpu = self.gpuCache['beta_gpu'] + Y_gpu = self.gpuCache['Y_gpu'] + betaY_gpu = self.gpuCache['betaY_gpu'] + psi2_t_gpu = self.gpuCache['psi2_t_gpu'] - psi2_full = np.zeros((num_inducing,num_inducing)) - psi1Y_full = np.zeros((num_inducing,output_dim)) # DxM - psi0_full = 0 - YRY_full = 0 - - for n_start in xrange(0,num_data,self.batchsize): - - n_end = min(self.batchsize+n_start, num_data) - - Y_slice = Y[n_start:n_end] - X_slice = X[n_start:n_end] - - if uncertain_inputs: - psi0 = kern.psi0(Z, X_slice) - psi1 = kern.psi1(Z, X_slice) - psi2 = kern.psi2(Z, X_slice) - else: - psi0 = kern.Kdiag(X_slice) - psi1 = kern.K(X_slice, Z) - psi2 = None - - if het_noise: - beta_slice = beta[n_start:n_end] - psi0_full += (beta_slice*psi0).sum() - psi1Y_full += np.dot(psi1.T,beta_slice[:,None]*Y_slice) # MxD - YRY_full += (beta_slice*np.square(Y_slice).sum(axis=-1)).sum() - else: - psi0_full += psi0.sum() - psi1Y_full += np.dot(psi1.T,Y_slice) # MxD - - - if uncertain_inputs: - if het_noise: - psi2_full += np.einsum('n,nmo->mo',beta_slice,psi2) - else: - psi2_full += psi2.sum(axis=0) - else: - if het_noise: - psi2_full += np.einsum('n,nm,no->mo',beta_slice,psi1,psi1) - else: - psi2_full += tdot(psi1.T) - - if not het_noise: - psi0_full *= beta - psi1Y_full *= beta - psi2_full *= beta + if het_noise: + beta_gpu.set(np.asfortranarray(beta)) + mul_bcast(betaY_gpu,beta_gpu,Y_gpu,beta_gpu.size) + YRY_full = cublas.cublasDdot(self.cublas_handle, Y_gpu.size, betaY_gpu.gpudata, 1, Y_gpu.gpudata, 1) + else: + beta_gpu.fill(beta) + betaY_gpu.fill(0.) + cublas.cublasDaxpy(self.cublas_handle, betaY_gpu.size, beta, Y_gpu.gpudata, 1, betaY_gpu, 1) YRY_full = trYYT*beta - - psi1Y_gpu = gpuarray.to_gpu(np.asfortranarray(psi1Y_full)) - psi2_gpu = gpuarray.to_gpu(np.asfortranarray(psi2_full)) + + if kern.useGPU: + psi1Y_gpu.fill(0.) + psi2_gpu.fill(0.) 
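# --- note (not part of the patch): the loop that follows accumulates the full-data
# statistics one mini-batch at a time, so only batch-sized psi arrays ever live on
# the GPU. A NumPy model of the homoscedastic psi1Y accumulation; the batch size B
# and all shapes are illustrative assumptions:
import numpy as np
N, M, D, B = 10, 4, 2, 5
Y = np.random.randn(N, D)
psi1 = np.random.randn(N, M)
beta = 0.1
psi1Y = np.zeros((M, D))
for s in range(0, N, B):
    psi1Y += psi1[s:s+B].T.dot(beta * Y[s:s+B])   # the per-batch cublasDgemm
assert np.allclose(psi1Y, psi1.T.dot(beta * Y))
# ---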
+ psi0_full = 0 + + for n_start in xrange(0,num_data,self.batchsize): + n_end = min(self.batchsize+n_start, num_data) + ndata = n_end - n_start + Y_slice = Y[n_start:n_end] + X_slice = X[n_start:n_end] + beta_gpu_slice = beta_gpu[n_start:n_end] + betaY_gpu_slice = betaY_gpu[n_start:n_end] + if ndata==self.batchsize: + psi2_t_gpu_slice = psi2_t_gpu + else: + psi2_t_gpu_slice = psi2_t_gpu[0:ndata] + if uncertain_inputs: + psi0p_gpu = kern.psi0(Z, X_slice) + psi1p_gpu = kern.psi1(Z, X_slice) + psi2p_gpu = kern.psi2(Z, X_slice) + else: + psi0p_gpu = kern.Kdiag(X_slice) + psi1p_gpu = kern.K(X_slice, Z) + + cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, output_dim, ndata, 1.0, psi1p_gpu.gpudata, ndata, betaY_gpu_slice.gpudata, ndata, 1.0, psi1Y_gpu.gpudata, num_inducing) + if het_noise: + psi0_full += cublas.cublasDdot(self.cublas_handle, psi0p_gpu.size, beta_gpu_slice.gpudata, 1, psi0p_gpu.gpudata, 1) + else: + psi0_full += gpuarray.sum(psi0p_gpu).get() + + if uncertain_inputs: + if het_noise: + mul_bcast(psi2_t_gpu_slice,beta_gpu_slice,psi2p_gpu,beta_gpu_slice.size) + sum_axis(psi2_gpu,psi2_t_gpu_slice,1,ndata) + else: + sum_axis(psi2_gpu,psi2p_gpu,1,ndata) + else: + if het_noise: + psi1_t_gpu = psi2_t_gpu_slice[:,:,0] + mul_bcast(psi1_t_gpu,beta_gpu_slice,psi1p_gpu,beta_gpu_slice.size) + cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, num_inducing, ndata, 1.0, psi1p_gpu.gpudata, ndata, psi1_t_gpu.gpudata, ndata, 1.0, psi2_gpu.gpudata, num_inducing) + else: + cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, num_inducing, ndata, beta, psi1p_gpu.gpudata, ndata, psi1p_gpu.gpudata, ndata, 1.0, psi2_gpu.gpudata, num_inducing) + + if not het_noise: + psi0_full *= beta + if uncertain_inputs: + cublas.cublasDscal(self.cublas_handle, psi2_gpu.size, beta, psi2_gpu.gpudata, 1) + + else: + psi2_full = np.zeros((num_inducing,num_inducing),order='F') + psi1Y_full = np.zeros((num_inducing,output_dim),order='F') # MxD + psi0_full = 0 + YRY_full = 0 + + for n_start in xrange(0,num_data,self.batchsize): + n_end = min(self.batchsize+n_start, num_data) + Y_slice = Y[n_start:n_end] + X_slice = X[n_start:n_end] + if uncertain_inputs: + psi0 = kern.psi0(Z, X_slice) + psi1 = kern.psi1(Z, X_slice) + psi2 = kern.psi2(Z, X_slice) + else: + psi0 = kern.Kdiag(X_slice) + psi1 = kern.K(X_slice, Z) + + if het_noise: + beta_slice = beta[n_start:n_end] + psi0_full += (beta_slice*psi0).sum() + psi1Y_full += np.dot(psi1.T,beta_slice[:,None]*Y_slice) # MxD + YRY_full += (beta_slice*np.square(Y_slice).sum(axis=-1)).sum() + else: + psi0_full += psi0.sum() + psi1Y_full += np.dot(psi1.T,Y_slice) # MxD + + if uncertain_inputs: + if het_noise: + psi2_full += np.einsum('n,nmo->mo',beta_slice,psi2) + else: + psi2_full += psi2.sum(axis=0) + else: + if het_noise: + psi2_full += np.einsum('n,nm,no->mo',beta_slice,psi1,psi1) + else: + psi2_full += tdot(psi1.T) + + if not het_noise: + psi0_full *= beta + psi1Y_full *= beta + psi2_full *= beta + YRY_full = trYYT*beta + + psi1Y_gpu.set(psi1Y_full) + psi2_gpu.set(psi2_full) #====================================================================== # Compute Common Components diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py index 2efa7a97..bca9d6ee 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py @@ -260,8 +260,11 @@ class PSICOMP_SSRBF(object): self.gpuCache = None def _initGPUCache(self, N, M, Q): + if self.gpuCache and 
self.gpuCacheAll['mu_gpu'].shape[0]N: + self.gpuCache = self.gpuCacheAll.copy() + for k in self._gpuCache_Nlist: + self.gpuCache[k] = self.gpuCacheAll[k][0:N] + + def _releaseMemory(self): + if not self.gpuCacheAll: + for k,v in self.gpuCacheAll: + v.gpudata.free() + del v + self.gpuCacheAll = None + self.gpuCache = None def psicomputations(self, variance, lengthscale, Z, mu, S, gamma): """Compute Psi statitsitcs""" @@ -492,166 +510,3 @@ class PSICOMP_SSRBF(object): linalg_gpu.sum_axis(grad_gamma_gpu, psi2_comb_gpu, N, M*M) return grad_mu_gpu.get(), grad_S_gpu.get(), grad_gamma_gpu.get() - -@Cache_this(limit=1) -def _Z_distances(Z): - Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q - Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q - return Zhat, Zdist - -def _psicomputations(variance, lengthscale, Z, mu, S, gamma): - """ - """ - - -@Cache_this(limit=1) -def _psi1computations(variance, lengthscale, Z, mu, S, gamma): - """ - Z - MxQ - mu - NxQ - S - NxQ - gamma - NxQ - """ - # here are the "statistics" for psi1 and psi2 - # Produced intermediate results: - # _psi1 NxM - # _dpsi1_dvariance NxM - # _dpsi1_dlengthscale NxMxQ - # _dpsi1_dZ NxMxQ - # _dpsi1_dgamma NxMxQ - # _dpsi1_dmu NxMxQ - # _dpsi1_dS NxMxQ - - lengthscale2 = np.square(lengthscale) - - # psi1 - _psi1_denom = S[:, None, :] / lengthscale2 + 1. # Nx1xQ - _psi1_denom_sqrt = np.sqrt(_psi1_denom) #Nx1xQ - _psi1_dist = Z[None, :, :] - mu[:, None, :] # NxMxQ - _psi1_dist_sq = np.square(_psi1_dist) / (lengthscale2 * _psi1_denom) # NxMxQ - _psi1_common = gamma[:,None,:] / (lengthscale2*_psi1_denom*_psi1_denom_sqrt) #Nx1xQ - _psi1_exponent1 = np.log(gamma[:,None,:]) -0.5 * (_psi1_dist_sq + np.log(_psi1_denom)) # NxMxQ - _psi1_exponent2 = np.log(1.-gamma[:,None,:]) -0.5 * (np.square(Z[None,:,:])/lengthscale2) # NxMxQ - _psi1_exponent_max = np.maximum(_psi1_exponent1,_psi1_exponent2) - _psi1_exponent = _psi1_exponent_max+np.log(np.exp(_psi1_exponent1-_psi1_exponent_max) + np.exp(_psi1_exponent2-_psi1_exponent_max)) #NxMxQ - _psi1_exp_sum = _psi1_exponent.sum(axis=-1) #NxM - _psi1_exp_dist_sq = np.exp(-0.5*_psi1_dist_sq) # NxMxQ - _psi1_exp_Z = np.exp(-0.5*np.square(Z[None,:,:])/lengthscale2) # 1xMxQ - _psi1_q = variance * np.exp(_psi1_exp_sum[:,:,None] - _psi1_exponent) # NxMxQ - _psi1 = variance * np.exp(_psi1_exp_sum) # NxM - _dpsi1_dvariance = _psi1 / variance # NxM - _dpsi1_dgamma = _psi1_q * (_psi1_exp_dist_sq/_psi1_denom_sqrt-_psi1_exp_Z) # NxMxQ - _dpsi1_dmu = _psi1_q * (_psi1_exp_dist_sq * _psi1_dist * _psi1_common) # NxMxQ - _dpsi1_dS = _psi1_q * (_psi1_exp_dist_sq * _psi1_common * 0.5 * (_psi1_dist_sq - 1.)) # NxMxQ - _dpsi1_dZ = _psi1_q * (- _psi1_common * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z) # NxMxQ - _dpsi1_dlengthscale = 2.*lengthscale*_psi1_q * (0.5*_psi1_common*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + 0.5*(1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z) # NxMxQ - - N = mu.shape[0] - M = Z.shape[0] - Q = mu.shape[1] - - l_gpu = gpuarray.empty((Q,),np.float64, order='F') - l_gpu.fill(lengthscale2) - Z_gpu = gpuarray.to_gpu(np.asfortranarray(Z)) - mu_gpu = gpuarray.to_gpu(np.asfortranarray(mu)) - S_gpu = gpuarray.to_gpu(np.asfortranarray(S)) - gamma_gpu = gpuarray.to_gpu(np.asfortranarray(gamma)) - logGamma_gpu = gpuarray.to_gpu(np.asfortranarray(np.log(gamma))) - log1Gamma_gpu = gpuarray.to_gpu(np.asfortranarray(np.log(1.-gamma))) - logpsi1denom_gpu = gpuarray.to_gpu(np.asfortranarray(np.log(S/lengthscale2+1.))) - psi1_gpu 
= gpuarray.empty((mu.shape[0],Z.shape[0]),np.float64, order='F') - psi1_neq_gpu = gpuarray.empty((N,M,Q),np.float64, order='F') - psi1exp1_gpu = gpuarray.empty((N,M,Q),np.float64, order='F') - psi1exp2_gpu = gpuarray.empty((N,M,Q),np.float64, order='F') - dpsi1_dvar_gpu = gpuarray.empty((N,M),np.float64, order='F') - dpsi1_dl_gpu = gpuarray.empty((N,M,Q),np.float64, order='F') - dpsi1_dZ_gpu = gpuarray.empty((N,M,Q),np.float64, order='F') - dpsi1_dgamma_gpu = gpuarray.empty((N,M,Q),np.float64, order='F') - dpsi1_dmu_gpu = gpuarray.empty((N,M,Q),np.float64, order='F') - dpsi1_dS_gpu = gpuarray.empty((N,M,Q),np.float64, order='F') - - comp_dpsi1_dvar(dpsi1_dvar_gpu,psi1_neq_gpu,psi1exp1_gpu,psi1exp2_gpu, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi1denom_gpu, N, M, Q) - comp_psi1_der(dpsi1_dl_gpu,dpsi1_dmu_gpu,dpsi1_dS_gpu,dpsi1_dgamma_gpu, dpsi1_dZ_gpu, psi1_neq_gpu,psi1exp1_gpu,psi1exp2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, gamma_gpu, N, M, Q) - -# print np.abs(dpsi1_dmu_gpu.get()-_dpsi1_dmu).max() - - return _psi1, _dpsi1_dvariance, _dpsi1_dgamma, _dpsi1_dmu, _dpsi1_dS, _dpsi1_dZ, _dpsi1_dlengthscale - -@Cache_this(limit=1) -def _psi2computations(variance, lengthscale, Z, mu, S, gamma): - """ - Z - MxQ - mu - NxQ - S - NxQ - gamma - NxQ - """ - # here are the "statistics" for psi1 and psi2 - # Produced intermediate results: - # _psi2 NxMxM - # _psi2_dvariance NxMxM - # _psi2_dlengthscale NxMxMxQ - # _psi2_dZ NxMxMxQ - # _psi2_dgamma NxMxMxQ - # _psi2_dmu NxMxMxQ - # _psi2_dS NxMxMxQ - - lengthscale2 = np.square(lengthscale) - - _psi2_Zhat, _psi2_Zdist = _Z_distances(Z) - _psi2_Zdist_sq = np.square(_psi2_Zdist / lengthscale) # M,M,Q - _psi2_Z_sq_sum = (np.square(Z[:,None,:])+np.square(Z[None,:,:]))/lengthscale2 # MxMxQ - - # psi2 - _psi2_denom = 2.*S[:, None, None, :] / lengthscale2 + 1. # Nx1x1xQ - _psi2_denom_sqrt = np.sqrt(_psi2_denom) - _psi2_mudist = mu[:,None,None,:]-_psi2_Zhat #N,M,M,Q - _psi2_mudist_sq = np.square(_psi2_mudist)/(lengthscale2*_psi2_denom) - _psi2_common = gamma[:,None,None,:]/(lengthscale2 * _psi2_denom * _psi2_denom_sqrt) # Nx1x1xQ - _psi2_exponent1 = -_psi2_Zdist_sq -_psi2_mudist_sq -0.5*np.log(_psi2_denom)+np.log(gamma[:,None,None,:]) #N,M,M,Q - _psi2_exponent2 = np.log(1.-gamma[:,None,None,:]) - 0.5*(_psi2_Z_sq_sum) # NxMxMxQ - _psi2_exponent_max = np.maximum(_psi2_exponent1, _psi2_exponent2) - _psi2_exponent = _psi2_exponent_max+np.log(np.exp(_psi2_exponent1-_psi2_exponent_max) + np.exp(_psi2_exponent2-_psi2_exponent_max)) - _psi2_exp_sum = _psi2_exponent.sum(axis=-1) #NxM - _psi2_q = np.square(variance) * np.exp(_psi2_exp_sum[:,:,:,None]-_psi2_exponent) # NxMxMxQ - _psi2_exp_dist_sq = np.exp(-_psi2_Zdist_sq -_psi2_mudist_sq) # NxMxMxQ - _psi2_exp_Z = np.exp(-0.5*_psi2_Z_sq_sum) # MxMxQ - _psi2 = np.square(variance) * np.exp(_psi2_exp_sum) # N,M,M - _dpsi2_dvariance = 2. * _psi2/variance # NxMxM - _dpsi2_dgamma = _psi2_q * (_psi2_exp_dist_sq/_psi2_denom_sqrt - _psi2_exp_Z) # NxMxMxQ - _dpsi2_dmu = _psi2_q * (-2.*_psi2_common*_psi2_mudist * _psi2_exp_dist_sq) # NxMxMxQ - _dpsi2_dS = _psi2_q * (_psi2_common * (2.*_psi2_mudist_sq - 1.) 
* _psi2_exp_dist_sq) # NxMxMxQ - _dpsi2_dZ = 2.*_psi2_q * (_psi2_common*(-_psi2_Zdist*_psi2_denom+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z) # NxMxMxQ - _dpsi2_dlengthscale = 2.*lengthscale* _psi2_q * (_psi2_common*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z) # NxMxMxQ - - N = mu.shape[0] - M = Z.shape[0] - Q = mu.shape[1] - - l_gpu = gpuarray.empty((Q,),np.float64, order='F') - l_gpu.fill(lengthscale2) - Z_gpu = gpuarray.to_gpu(np.asfortranarray(Z)) - mu_gpu = gpuarray.to_gpu(np.asfortranarray(mu)) - S_gpu = gpuarray.to_gpu(np.asfortranarray(S)) - gamma_gpu = gpuarray.to_gpu(np.asfortranarray(gamma)) - logGamma_gpu = gpuarray.to_gpu(np.asfortranarray(np.log(gamma))) - log1Gamma_gpu = gpuarray.to_gpu(np.asfortranarray(np.log(1.-gamma))) - logpsi2denom_gpu = gpuarray.to_gpu(np.asfortranarray(np.log(2.*S/lengthscale2+1.))) - psi2_gpu = gpuarray.empty((mu.shape[0],Z.shape[0],Z.shape[0]),np.float64, order='F') - psi2_neq_gpu = gpuarray.empty((N,M,M,Q),np.float64, order='F') - psi2exp1_gpu = gpuarray.empty((N,M,M,Q),np.float64, order='F') - psi2exp2_gpu = gpuarray.empty((M,M,Q),np.float64, order='F') - dpsi2_dvar_gpu = gpuarray.empty((N,M,M),np.float64, order='F') - dpsi2_dl_gpu = gpuarray.empty((N,M,M,Q),np.float64, order='F') - dpsi2_dZ_gpu = gpuarray.empty((N,M,M,Q),np.float64, order='F') - dpsi2_dgamma_gpu = gpuarray.empty((N,M,M,Q),np.float64, order='F') - dpsi2_dmu_gpu = gpuarray.empty((N,M,M,Q),np.float64, order='F') - dpsi2_dS_gpu = gpuarray.empty((N,M,M,Q),np.float64, order='F') - - #comp_psi2(psi2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi2denom_gpu, N, M, Q) - - comp_dpsi2_dvar(dpsi2_dvar_gpu,psi2_neq_gpu,psi2exp1_gpu,psi2exp2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi2denom_gpu, N, M, Q) - comp_psi2_der(dpsi2_dl_gpu,dpsi2_dmu_gpu,dpsi2_dS_gpu,dpsi2_dgamma_gpu, dpsi2_dZ_gpu, psi2_neq_gpu,psi2exp1_gpu,psi2exp2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, gamma_gpu, N, M, Q) - -# print np.abs(dpsi2_dvar_gpu.get()-_dpsi2_dvariance).max() - - return _psi2, _dpsi2_dvariance, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _dpsi2_dZ, _dpsi2_dlengthscale From f07f66f1f7e3c97c112e32293dca853c5780e1ff Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Thu, 3 Apr 2014 11:07:54 +0100 Subject: [PATCH 19/33] [GPU] vardtc_likelihood 1 --- .../latent_function_inference/var_dtc_gpu.py | 67 +++++++++---------- 1 file changed, 31 insertions(+), 36 deletions(-) diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py index c18102e4..c7e5c18a 100644 --- a/GPy/inference/latent_function_inference/var_dtc_gpu.py +++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py @@ -33,12 +33,7 @@ class VarDTC_GPU(object): def __init__(self, batchsize, limit=1): self.batchsize = batchsize - - # Cache functions - from ...util.caching import Cacher - self.get_trYYT = Cacher(self._get_trYYT, limit) - self.get_YYTfactor = Cacher(self._get_YYTfactor, limit) - + self.midRes = {} self.batch_pos = 0 # the starting position of the current mini-batch @@ -99,7 +94,7 @@ class VarDTC_GPU(object): num_inducing = Z.shape[0] num_data, output_dim = Y.shape - self._initGPUCache(num_inducing, output_dim) + self._initGPUCache(num_inducing, output_dim, Y) if isinstance(X, VariationalPosterior): uncertain_inputs = True @@ -125,7 +120,7 @@ 
class VarDTC_GPU(object): else: beta_gpu.fill(beta) betaY_gpu.fill(0.) - cublas.cublasDaxpy(self.cublas_handle, betaY_gpu.size, beta, Y_gpu.gpudata, 1, betaY_gpu, 1) + cublas.cublasDaxpy(self.cublas_handle, betaY_gpu.size, beta, Y_gpu.gpudata, 1, betaY_gpu.gpudata, 1) YRY_full = trYYT*beta if kern.useGPU: @@ -234,37 +229,37 @@ class VarDTC_GPU(object): diag.add(Kmm, self.const_jitter) ones_gpu = self.gpuCache['ones_gpu'] cublas.cublasDaxpy(self.cublas_handle, num_inducing, self.const_jitter, ones_gpu.gpudata, 1, Kmm_gpu.gpudata, num_inducing+1) - assert np.allclose(Kmm, Kmm_gpu.get()) +# assert np.allclose(Kmm, Kmm_gpu.get()) - Lm = jitchol(Kmm) +# Lm = jitchol(Kmm) # Lm_gpu = self.gpuCache['Lm_gpu'] cublas.cublasDcopy(self.cublas_handle, Kmm_gpu.size, Kmm_gpu.gpudata, 1, Lm_gpu.gpudata, 1) culinalg.cho_factor(Lm_gpu,'L') - print np.abs(np.tril(Lm)-np.tril(Lm_gpu.get())).max() +# print np.abs(np.tril(Lm)-np.tril(Lm_gpu.get())).max() - Lambda = Kmm+psi2_full - LL = jitchol(Lambda) +# Lambda = Kmm+psi2_full +# LL = jitchol(Lambda) # Lambda_gpu = self.gpuCache['LL_gpu'] cublas.cublasDcopy(self.cublas_handle, Kmm_gpu.size, Kmm_gpu.gpudata, 1, Lambda_gpu.gpudata, 1) cublas.cublasDaxpy(self.cublas_handle, psi2_gpu.size, np.float64(1.0), psi2_gpu.gpudata, 1, Lambda_gpu.gpudata, 1) LL_gpu = Lambda_gpu culinalg.cho_factor(LL_gpu,'L') - print np.abs(np.tril(LL)-np.tril(LL_gpu.get())).max() +# print np.abs(np.tril(LL)-np.tril(LL_gpu.get())).max() - b,_ = dtrtrs(LL, psi1Y_full) - bbt_cpu = np.square(b).sum() +# b,_ = dtrtrs(LL, psi1Y_full) +# bbt_cpu = np.square(b).sum() # b_gpu = self.gpuCache['b_gpu'] cublas.cublasDcopy(self.cublas_handle, b_gpu.size, psi1Y_gpu.gpudata, 1, b_gpu.gpudata, 1) cublas.cublasDtrsm(self.cublas_handle , 'L', 'L', 'N', 'N', num_inducing, output_dim, np.float64(1.0), LL_gpu.gpudata, num_inducing, b_gpu.gpudata, num_inducing) bbt = cublas.cublasDdot(self.cublas_handle, b_gpu.size, b_gpu.gpudata, 1, b_gpu.gpudata, 1) - print np.abs(bbt-bbt_cpu) +# print np.abs(bbt-bbt_cpu) - v,_ = dtrtrs(LL.T,b,lower=False) - vvt = np.einsum('md,od->mo',v,v) - LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right') +# v,_ = dtrtrs(LL.T,b,lower=False) +# vvt = np.einsum('md,od->mo',v,v) +# LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right') # v_gpu = self.gpuCache['v_gpu'] cublas.cublasDcopy(self.cublas_handle, v_gpu.size, b_gpu.gpudata, 1, v_gpu.gpudata, 1) @@ -277,13 +272,13 @@ class VarDTC_GPU(object): cublas.cublasDtrsm(self.cublas_handle , 'r', 'L', 'T', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, LmInvPsi2LmInvT_gpu.gpudata, num_inducing) #tr_LmInvPsi2LmInvT = cublas.cublasDasum(self.cublas_handle, num_inducing, LmInvPsi2LmInvT_gpu.gpudata, num_inducing+1) tr_LmInvPsi2LmInvT = float(strideSum(LmInvPsi2LmInvT_gpu, num_inducing+1).get()) - print np.abs(vvt-vvt_gpu.get()).max() - print np.abs(np.trace(LmInvPsi2LmInvT)-tr_LmInvPsi2LmInvT) +# print np.abs(vvt-vvt_gpu.get()).max() +# print np.abs(np.trace(LmInvPsi2LmInvT)-tr_LmInvPsi2LmInvT) - Psi2LLInvT = dtrtrs(LL,psi2_full)[0].T - LmInvPsi2LLInvT= dtrtrs(Lm,Psi2LLInvT)[0] - KmmInvPsi2LLInvT = dtrtrs(Lm,LmInvPsi2LLInvT,trans=True)[0] - KmmInvPsi2P = dtrtrs(LL,KmmInvPsi2LLInvT.T, trans=True)[0].T +# Psi2LLInvT = dtrtrs(LL,psi2_full)[0].T +# LmInvPsi2LLInvT= dtrtrs(Lm,Psi2LLInvT)[0] +# KmmInvPsi2LLInvT = dtrtrs(Lm,LmInvPsi2LLInvT,trans=True)[0] +# KmmInvPsi2P = dtrtrs(LL,KmmInvPsi2LLInvT.T, trans=True)[0].T # KmmInvPsi2LLInvT_gpu = LmInvPsi2LmInvT_gpu # Reuse GPU memory 
(size:MxM) cublas.cublasDcopy(self.cublas_handle, psi2_gpu.size, psi2_gpu.gpudata, 1, KmmInvPsi2LLInvT_gpu.gpudata, 1) @@ -293,19 +288,19 @@ class VarDTC_GPU(object): KmmInvPsi2P_gpu = self.gpuCache['KmmInvPsi2P_gpu'] cublas.cublasDcopy(self.cublas_handle, KmmInvPsi2LLInvT_gpu.size, KmmInvPsi2LLInvT_gpu.gpudata, 1, KmmInvPsi2P_gpu.gpudata, 1) cublas.cublasDtrsm(self.cublas_handle , 'r', 'L', 'N', 'N', num_inducing, num_inducing, np.float64(1.0), LL_gpu.gpudata, num_inducing, KmmInvPsi2P_gpu.gpudata, num_inducing) - print np.abs(KmmInvPsi2P-KmmInvPsi2P_gpu.get()).max() +# print np.abs(KmmInvPsi2P-KmmInvPsi2P_gpu.get()).max() - dL_dpsi2R = (output_dim*KmmInvPsi2P - vvt)/2. # dL_dpsi2 with R inside psi2 +# dL_dpsi2R = (output_dim*KmmInvPsi2P - vvt)/2. # dL_dpsi2 with R inside psi2 # dL_dpsi2R_gpu = self.gpuCache['dL_dpsi2R_gpu'] cublas.cublasDcopy(self.cublas_handle, vvt_gpu.size, vvt_gpu.gpudata, 1, dL_dpsi2R_gpu.gpudata, 1) cublas.cublasDaxpy(self.cublas_handle, KmmInvPsi2P_gpu.size, np.float64(-output_dim), KmmInvPsi2P_gpu.gpudata, 1, dL_dpsi2R_gpu.gpudata, 1) cublas.cublasDscal(self.cublas_handle, dL_dpsi2R_gpu.size, np.float64(-0.5), dL_dpsi2R_gpu.gpudata, 1) - print np.abs(dL_dpsi2R_gpu.get()-dL_dpsi2R).max() +# print np.abs(dL_dpsi2R_gpu.get()-dL_dpsi2R).max() # Cache intermediate results - self.midRes['dL_dpsi2R'] = dL_dpsi2R - self.midRes['v'] = v + self.midRes['dL_dpsi2R'] = dL_dpsi2R_gpu.get() + self.midRes['v'] = v_gpu.get() #logDiagSum = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="i%step==0?log(x[i]):0", arguments="double *x, int step") @@ -316,24 +311,24 @@ class VarDTC_GPU(object): logL_R = -np.log(beta).sum() else: logL_R = -num_data*np.log(beta) - logL_old = -(output_dim*(num_data*log_2_pi+logL_R+psi0_full-np.trace(LmInvPsi2LmInvT))+YRY_full-bbt)/2.-output_dim*(-np.log(np.diag(Lm)).sum()+np.log(np.diag(LL)).sum()) +# logL_old = -(output_dim*(num_data*log_2_pi+logL_R+psi0_full-np.trace(LmInvPsi2LmInvT))+YRY_full-bbt)/2.-output_dim*(-np.log(np.diag(Lm)).sum()+np.log(np.diag(LL)).sum()) logdetKmm = float(logDiagSum(Lm_gpu,num_inducing+1).get()) logdetLambda = float(logDiagSum(LL_gpu,num_inducing+1).get()) logL = -(output_dim*(num_data*log_2_pi+logL_R+psi0_full-tr_LmInvPsi2LmInvT)+YRY_full-bbt)/2.+output_dim*(logdetKmm-logdetLambda) - print np.abs(logL_old - logL) +# print np.abs(logL_old - logL) #====================================================================== # Compute dL_dKmm #====================================================================== - dL_dKmm = -(output_dim*np.einsum('md,od->mo',KmmInvPsi2LLInvT,KmmInvPsi2LLInvT) + vvt)/2. +# dL_dKmm = -(output_dim*np.einsum('md,od->mo',KmmInvPsi2LLInvT,KmmInvPsi2LLInvT) + vvt)/2. 
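# --- note (not part of the patch): the einsum in the retired CPU line above is just
# the outer product A.dot(A.T), which is why a single cublasDgemm('N','T',...) below
# can replace it. NumPy check, with A an arbitrary stand-in matrix:
import numpy as np
A = np.random.randn(4, 3)
assert np.allclose(np.einsum('md,od->mo', A, A), A.dot(A.T))
# ---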
# dL_dKmm_gpu = self.gpuCache['dL_dKmm_gpu'] cublas.cublasDgemm(self.cublas_handle, 'N', 'T', num_inducing, num_inducing, num_inducing, np.float64(1.0), KmmInvPsi2LLInvT_gpu.gpudata, num_inducing, KmmInvPsi2LLInvT_gpu.gpudata, num_inducing, np.float64(0.), dL_dKmm_gpu.gpudata, num_inducing) cublas.cublasDaxpy(self.cublas_handle, dL_dKmm_gpu.size, np.float64(1./output_dim), vvt_gpu.gpudata, 1, dL_dKmm_gpu.gpudata, 1) cublas.cublasDscal(self.cublas_handle, dL_dKmm_gpu.size, np.float64(-output_dim/2.), dL_dKmm_gpu.gpudata, 1) - print np.abs(dL_dKmm - dL_dKmm_gpu.get()).max() +# print np.abs(dL_dKmm - dL_dKmm_gpu.get()).max() #====================================================================== # Compute the Posterior distribution of inducing points p(u|Y) @@ -341,7 +336,7 @@ class VarDTC_GPU(object): post = Posterior(woodbury_inv=KmmInvPsi2P_gpu.get(), woodbury_vector=v_gpu.get(), K=Kmm_gpu.get(), mean=None, cov=None, K_chol=Lm_gpu.get()) - return logL, dL_dKmm, post + return logL, dL_dKmm_gpu.get(), post def inference_minibatch(self, kern, X, Z, likelihood, Y): """ From bb5c41f64cf2109fb8203041ef0c19a59194a87f Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Thu, 3 Apr 2014 12:27:56 +0100 Subject: [PATCH 20/33] [GPU] bug fix --- .../latent_function_inference/var_dtc_gpu.py | 31 +++++++++++-------- GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py | 12 ++----- GPy/util/linalg_gpu.py | 2 +- 3 files changed, 22 insertions(+), 23 deletions(-) diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py index c7e5c18a..59cf2b0a 100644 --- a/GPy/inference/latent_function_inference/var_dtc_gpu.py +++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py @@ -61,15 +61,15 @@ class VarDTC_GPU(object): 'psi1Y_gpu' :gpuarray.empty((num_inducing,output_dim),np.float64,order='F'), 'psi2_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), 'beta_gpu' :gpuarray.empty((output_dim,),np.float64,order='F'), - 'Y_gpu' :gpuarray.to_gpu(np.asfortranarray(Y)), - 'betaY_gpu' :gpuarray.empty(Y.shape,np.float64,order='F'), + 'YT_gpu' :gpuarray.to_gpu(np.asfortranarray(Y).T), # DxN + 'betaYT_gpu' :gpuarray.empty(Y.T.shape,np.float64,order='F'), # DxN 'psi2_t_gpu' :gpuarray.empty((self.batchsize,num_inducing,num_inducing),np.float64,order='F'), # inference_minibatch } self.gpuCache['ones_gpu'].fill(1.0) - Y_gpu = self.gpuCache['Y_gpu'] - self._trYYT = cublas.cublasDdot(self.cublas_handle, Y_gpu.size, Y_gpu.gpudata, 1, Y_gpu.gpudata, 1) + YT_gpu = self.gpuCache['YT_gpu'] + self._trYYT = cublas.cublasDdot(self.cublas_handle, YT_gpu.size, YT_gpu.gpudata, 1, YT_gpu.gpudata, 1) def _get_YYTfactor(self, Y): """ @@ -109,32 +109,32 @@ class VarDTC_GPU(object): psi1Y_gpu = self.gpuCache['psi1Y_gpu'] psi2_gpu = self.gpuCache['psi2_gpu'] beta_gpu = self.gpuCache['beta_gpu'] - Y_gpu = self.gpuCache['Y_gpu'] - betaY_gpu = self.gpuCache['betaY_gpu'] + YT_gpu = self.gpuCache['YT_gpu'] + betaYT_gpu = self.gpuCache['betaYT_gpu'] psi2_t_gpu = self.gpuCache['psi2_t_gpu'] if het_noise: beta_gpu.set(np.asfortranarray(beta)) - mul_bcast(betaY_gpu,beta_gpu,Y_gpu,beta_gpu.size) - YRY_full = cublas.cublasDdot(self.cublas_handle, Y_gpu.size, betaY_gpu.gpudata, 1, Y_gpu.gpudata, 1) + mul_bcast(betaYT_gpu,beta_gpu,YT_gpu,beta_gpu.size) + YRY_full = cublas.cublasDdot(self.cublas_handle, YT_gpu.size, betaYT_gpu.gpudata, 1, YT_gpu.gpudata, 1) else: beta_gpu.fill(beta) - betaY_gpu.fill(0.) 
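# --- note (not part of the patch): the _trYYT shortcut above relies on tr(Y*Y.T)
# being the squared Frobenius norm of Y, i.e. a single dot product of the flattened
# array with itself, which is exactly what the cublasDdot call computes. NumPy check:
import numpy as np
Y = np.random.randn(5, 3)
assert np.allclose(np.trace(Y.dot(Y.T)), Y.ravel().dot(Y.ravel()))
# ---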
- cublas.cublasDaxpy(self.cublas_handle, betaY_gpu.size, beta, Y_gpu.gpudata, 1, betaY_gpu.gpudata, 1) + betaYT_gpu.fill(0.) + cublas.cublasDaxpy(self.cublas_handle, betaYT_gpu.size, beta, YT_gpu.gpudata, 1, betaYT_gpu.gpudata, 1) YRY_full = trYYT*beta if kern.useGPU: psi1Y_gpu.fill(0.) psi2_gpu.fill(0.) psi0_full = 0 + psi1Y_full = np.zeros((num_inducing,output_dim),order='F') # MxD for n_start in xrange(0,num_data,self.batchsize): n_end = min(self.batchsize+n_start, num_data) ndata = n_end - n_start - Y_slice = Y[n_start:n_end] X_slice = X[n_start:n_end] beta_gpu_slice = beta_gpu[n_start:n_end] - betaY_gpu_slice = betaY_gpu[n_start:n_end] + betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end] if ndata==self.batchsize: psi2_t_gpu_slice = psi2_t_gpu else: @@ -147,7 +147,12 @@ class VarDTC_GPU(object): psi0p_gpu = kern.Kdiag(X_slice) psi1p_gpu = kern.K(X_slice, Z) - cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, output_dim, ndata, 1.0, psi1p_gpu.gpudata, ndata, betaY_gpu_slice.gpudata, ndata, 1.0, psi1Y_gpu.gpudata, num_inducing) + cublas.cublasDgemm(self.cublas_handle, 'T', 'T', num_inducing, output_dim, ndata, 1.0, psi1p_gpu.gpudata, ndata, betaYT_gpu_slice.gpudata, output_dim, 1.0, psi1Y_gpu.gpudata, num_inducing) + psi1Y_full += np.dot(psi1p_gpu.get().T,Y_slice)*beta # MxD +# print psi1Y_gpu.get() +# print psi1Y_full + print np.abs(psi1Y_gpu.get()-psi1Y_full).max() + if het_noise: psi0_full += cublas.cublasDdot(self.cublas_handle, psi0p_gpu.size, beta_gpu_slice.gpudata, 1, psi0p_gpu.gpudata, 1) else: diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py index bca9d6ee..0aebf399 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py @@ -260,7 +260,7 @@ class PSICOMP_SSRBF(object): self.gpuCache = None def _initGPUCache(self, N, M, Q): - if self.gpuCache and self.gpuCacheAll['mu_gpu'].shape[0]N: - self.gpuCache = self.gpuCacheAll.copy() - for k in self._gpuCache_Nlist: - self.gpuCache[k] = self.gpuCacheAll[k][0:N] def _releaseMemory(self): if not self.gpuCacheAll: @@ -361,7 +354,8 @@ class PSICOMP_SSRBF(object): comp_psi1(psi1_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi1denom_gpu, N, M, Q) comp_psi2(psi2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi2denom_gpu, N, M, Q) - return psi0_gpu.get(), psi1_gpu.get(), psi2_gpu.get() +# return psi0_gpu.get(), psi1_gpu.get(), psi2_gpu.get() + return psi0_gpu, psi1_gpu, psi2_gpu def _psiDercomputations(self, variance, lengthscale, Z, mu, S, gamma): """Compute the derivatives w.r.t. 
Psi statistics""" diff --git a/GPy/util/linalg_gpu.py b/GPy/util/linalg_gpu.py index 60eb8101..039b0d62 100644 --- a/GPy/util/linalg_gpu.py +++ b/GPy/util/linalg_gpu.py @@ -28,7 +28,7 @@ try: # log(1.0-X) logOne = ElementwiseKernel("double *in, double *out", "out[i] = log(1.-in[i])", "logOne_element") - # multiplication with broadcast on the last dimension + # multiplication with broadcast on the last dimension (out = shorter[:,None]*longer) mul_bcast = ElementwiseKernel("double *out, double *shorter, double *longer, int shorter_size", "out[i] = longer[i]*shorter[i%shorter_size]", "mul_bcast") # sum through the middle dimension (size_2) of a 3D matrix (size_1, size_2, size_3) From 8c4507d9f135ee257b2df8f23eb4af18caf85681 Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Thu, 3 Apr 2014 12:29:44 +0100 Subject: [PATCH 21/33] [GPU] bug fix --- GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py index 0aebf399..263884dd 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py @@ -260,11 +260,11 @@ class PSICOMP_SSRBF(object): self.gpuCache = None def _initGPUCache(self, N, M, Q): - if self.gpuCache and self.gpuCacheAll['mu_gpu'].shape[0]!=N: + if self.gpuCache and self.gpuCache['mu_gpu'].shape[0]!=N: self._releaseMemory() if self.gpuCache == None: - self.gpuCacheAll = { + self.gpuCache = { 'l_gpu' :gpuarray.empty((Q,),np.float64,order='F'), 'Z_gpu' :gpuarray.empty((M,Q),np.float64,order='F'), 'mu_gpu' :gpuarray.empty((N,Q),np.float64,order='F'), @@ -306,11 +306,11 @@ class PSICOMP_SSRBF(object): } def _releaseMemory(self): - if not self.gpuCacheAll: - for k,v in self.gpuCacheAll: + if not self.gpuCache: + for k,v in self.gpuCache: v.gpudata.free() del v - self.gpuCacheAll = None + del self.gpuCache self.gpuCache = None def psicomputations(self, variance, lengthscale, Z, mu, S, gamma): From 22e4f8a1e83947c89f573db4715b7641ebe82d61 Mon Sep 17 00:00:00 2001 From: mzwiessele Date: Fri, 4 Apr 2014 13:14:07 +0100 Subject: [PATCH 22/33] not importable --- GPy/likelihoods/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/GPy/likelihoods/__init__.py b/GPy/likelihoods/__init__.py index 5d5d692a..d7ad5753 100644 --- a/GPy/likelihoods/__init__.py +++ b/GPy/likelihoods/__init__.py @@ -15,8 +15,8 @@ except ImportError: if sympy_available: # These are likelihoods that rely on symbolic. 
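# --- note (not part of the patch): the sympy_available flag guarding this block is
# the usual optional-dependency idiom: probe the import once at package load, then
# gate the symbolic likelihoods on the result. A minimal sketch (only the flag name
# is taken from this file, the rest is generic):
try:
    import sympy
    sympy_available = True
except ImportError:
    sympy_available = False
# ---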
from symbolic import Symbolic - from sstudent_t import SstudentT + #from sstudent_t import SstudentT from negative_binomial import Negative_binomial - from skew_normal import Skew_normal - from skew_exponential import Skew_exponential - from null_category import Null_category + #from skew_normal import Skew_normal + #from skew_exponential import Skew_exponential + #from null_category import Null_category From 04a889b3a924c60e1d04ef304f12f5b668ffe86b Mon Sep 17 00:00:00 2001 From: mzwiessele Date: Fri, 4 Apr 2014 13:47:02 +0100 Subject: [PATCH 23/33] making observables accessable --- GPy/core/__init__.py | 1 + GPy/core/parameterization/lists_and_dicts.py | 66 ++++++++----------- GPy/core/parameterization/observable_array.py | 2 +- GPy/core/parameterization/param.py | 2 +- GPy/core/parameterization/parameter_core.py | 27 +++----- GPy/core/parameterization/parameterized.py | 2 +- GPy/testing/pickle_tests.py | 4 +- 7 files changed, 45 insertions(+), 59 deletions(-) diff --git a/GPy/core/__init__.py b/GPy/core/__init__.py index a42d76ed..25651827 100644 --- a/GPy/core/__init__.py +++ b/GPy/core/__init__.py @@ -4,6 +4,7 @@ from model import * from parameterization.parameterized import adjust_name_for_printing, Parameterizable from parameterization.param import Param, ParamConcatenation +from parameterization.observable_array import ObsAr from gp import GP from sparse_gp import SparseGP diff --git a/GPy/core/parameterization/lists_and_dicts.py b/GPy/core/parameterization/lists_and_dicts.py index 6902c249..dd93c5ba 100644 --- a/GPy/core/parameterization/lists_and_dicts.py +++ b/GPy/core/parameterization/lists_and_dicts.py @@ -5,6 +5,7 @@ Created on 27 Feb 2014 ''' from collections import defaultdict +import weakref def intarray_default_factory(): import numpy as np @@ -41,49 +42,40 @@ class ObservablesList(object): def __init__(self): self._poc = [] - def remove(self, value): - return self._poc.remove(value) - - - def __delitem__(self, ind): - return self._poc.__delitem__(ind) - - - def __setitem__(self, ind, item): - return self._poc.__setitem__(ind, item) - - - def __getitem__(self, ind): - return self._poc.__getitem__(ind) - + def remove(self, priority, observable, callble): + """ + """ + self._poc.remove((priority, observable, callble)) def __repr__(self): return self._poc.__repr__() - - def append(self, obj): - return self._poc.append(obj) - - - def index(self, value): - return self._poc.index(value) - - - def extend(self, iterable): - return self._poc.extend(iterable) - - + def add(self, priority, observable, callble): + i = 0 + for i, [p, _, _] in enumerate(self._poc): + if p < priority: + break + self._poc.insert(i, (priority, weakref.ref(observable), callble)) + def __str__(self): - return self._poc.__str__() - + ret = [] + curr_p = None + for p, o, c in self: + curr = '' + if curr_p != p: + pre = "{!s}: ".format(p) + curr_pre = pre + else: curr_pre = " "*len(pre) + curr_p = p + curr += curr_pre + ret.append(curr + ", ".join(map(str, [o,c]))) + return '\n'.join(ret) def __iter__(self): - return self._poc.__iter__() - - - def insert(self, index, obj): - return self._poc.insert(index, obj) - + self._poc = [(p,o,c) for p,o,c in self._poc if o() is not None] + for p, o, c in self._poc: + if o() is not None: + yield p, o(), c def __len__(self): return self._poc.__len__() @@ -106,6 +98,6 @@ class ObservablesList(object): def __setstate__(self, state): self._poc = [] for p, o, c in state: - self._poc.append((p,o,getattr(o, c))) + self.add(p,o,getattr(o, c)) pass diff --git 
a/GPy/core/parameterization/observable_array.py b/GPy/core/parameterization/observable_array.py index fc9d6cf2..56d33bfc 100644 --- a/GPy/core/parameterization/observable_array.py +++ b/GPy/core/parameterization/observable_array.py @@ -25,7 +25,7 @@ class ObsAr(np.ndarray, Pickleable, Observable): def __array_finalize__(self, obj): # see InfoArray.__array_finalize__ for comments if obj is None: return - self._observer_callables_ = getattr(obj, '_observer_callables_', None) + self.observers = getattr(obj, 'observers', None) def __array_wrap__(self, out_arr, context=None): return out_arr.view(np.ndarray) diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index 60bdfe9d..4490a8ee 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -59,7 +59,7 @@ class Param(OptimizationHandlable, ObsAr): import pydot node = pydot.Node(id(self), shape='record', label=self.name) G.add_node(node) - for o in self._observer_callables_.keys(): + for o in self.observers.keys(): label = o.name if hasattr(o, 'name') else str(o) observed_node = pydot.Node(id(o), label=label) G.add_node(observed_node) diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index 2dac9bf3..43bc7177 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -44,22 +44,23 @@ class Observable(object): def __init__(self, *args, **kwargs): super(Observable, self).__init__() from lists_and_dicts import ObservablesList - self._observer_callables_ = ObservablesList() + self.observers = ObservablesList() def add_observer(self, observer, callble, priority=0): - self._insert_sorted(priority, observer, callble) + self.observers.add(priority, observer, callble) def remove_observer(self, observer, callble=None): to_remove = [] - for p, obs, clble in self._observer_callables_: + for poc in self.observers: + _, obs, clble = poc if callble is not None: if (obs == observer) and (callble == clble): - to_remove.append((p, obs, clble)) + to_remove.append(poc) else: if obs is observer: - to_remove.append((p, obs, clble)) + to_remove.append(poc) for r in to_remove: - self._observer_callables_.remove(r) + self.observers.remove(*r) def notify_observers(self, which=None, min_priority=None): """ @@ -74,21 +75,13 @@ class Observable(object): if which is None: which = self if min_priority is None: - [callble(self, which=which) for _, _, callble in self._observer_callables_] + [callble(self, which=which) for _, _, callble in self.observers] else: - for p, _, callble in self._observer_callables_: + for p, _, callble in self.observers: if p <= min_priority: break callble(self, which=which) - def _insert_sorted(self, p, o, c): - ins = 0 - for pr, _, _ in self._observer_callables_: - if p > pr: - break - ins += 1 - self._observer_callables_.insert(ins, (p, o, c)) - #=============================================================================== # Foundation framework for parameterized and param objects: #=============================================================================== @@ -192,7 +185,7 @@ class Pickleable(object): def __getstate__(self): ignore_list = ([#'_parent_', '_parent_index_', - #'_observer_callables_', + #'observers', '_param_array_', '_gradient_array_', '_fixes_', '_Cacher_wrap__cachers'] #+ self.parameter_names(recursive=False) diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py index 75085ca2..a794ab40 100644 --- 
a/GPy/core/parameterization/parameterized.py +++ b/GPy/core/parameterization/parameterized.py @@ -90,7 +90,7 @@ class Parameterized(Parameterizable): child_node = child.build_pydot(G) G.add_edge(pydot.Edge(node, child_node)) - for o in self._observer_callables_.keys(): + for o in self.observers.keys(): label = o.name if hasattr(o, 'name') else str(o) observed_node = pydot.Node(id(o), label=label) G.add_node(observed_node) diff --git a/GPy/testing/pickle_tests.py b/GPy/testing/pickle_tests.py index fc52581a..b888353c 100644 --- a/GPy/testing/pickle_tests.py +++ b/GPy/testing/pickle_tests.py @@ -191,13 +191,13 @@ class Test(ListDictTestCase): par.count = 0 par.add_observer(self, self._callback, 1) pcopy = GPRegression(par.X.copy(), par.Y.copy(), kernel=par.kern.copy()) - self.assertNotIn(par._observer_callables_[0], pcopy._observer_callables_) + self.assertNotIn(par.observers[0], pcopy.observers) pcopy = par.copy() pcopy.name = "copy" self.assertTrue(par.checkgrad()) self.assertTrue(pcopy.checkgrad()) self.assertTrue(pcopy.kern.checkgrad()) - self.assertIn(par._observer_callables_[0], pcopy._observer_callables_) + self.assertIn(par.observers[0], pcopy.observers) self.assertEqual(par.count, 3) self.assertEqual(pcopy.count, 6) # 3 of each call to checkgrad From 11059fb6152b36783780eaa1ef5d4b5aadce7bd0 Mon Sep 17 00:00:00 2001 From: mzwiessele Date: Fri, 4 Apr 2014 14:20:10 +0100 Subject: [PATCH 24/33] made observers accessible and observers now only weak reference the observables --- GPy/core/parameterization/lists_and_dicts.py | 29 +++++++++++++++----- GPy/core/parameterization/param.py | 2 +- GPy/testing/pickle_tests.py | 1 + 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/GPy/core/parameterization/lists_and_dicts.py b/GPy/core/parameterization/lists_and_dicts.py index dd93c5ba..604d0a01 100644 --- a/GPy/core/parameterization/lists_and_dicts.py +++ b/GPy/core/parameterization/lists_and_dicts.py @@ -42,20 +42,29 @@ class ObservablesList(object): def __init__(self): self._poc = [] + def __getitem__(self, ind): + p,o,c = self._poc[ind] + return p, o(), c + def remove(self, priority, observable, callble): """ """ - self._poc.remove((priority, observable, callble)) + self.flush() + for i in range(len(self) - 1, -1, -1): + p,o,c = self[i] + if priority==p and observable==o and callble==c: + del self._poc[i] def __repr__(self): return self._poc.__repr__() def add(self, priority, observable, callble): - i = 0 - for i, [p, _, _] in enumerate(self._poc): - if p < priority: + ins = 0 + for pr, _, _ in self: + if priority > pr: break - self._poc.insert(i, (priority, weakref.ref(observable), callble)) + ins += 1 + self._poc.insert(ins, (priority, weakref.ref(observable), callble)) def __str__(self): ret = [] @@ -68,25 +77,31 @@ class ObservablesList(object): else: curr_pre = " "*len(pre) curr_p = p curr += curr_pre - ret.append(curr + ", ".join(map(str, [o,c]))) + ret.append(curr + ", ".join(map(repr, [o,c]))) return '\n'.join(ret) - def __iter__(self): + def flush(self): self._poc = [(p,o,c) for p,o,c in self._poc if o() is not None] + + def __iter__(self): + self.flush() for p, o, c in self._poc: if o() is not None: yield p, o(), c def __len__(self): + self.flush() return self._poc.__len__() def __deepcopy__(self, memo): + self.flush() s = ObservablesList() import copy s._poc = copy.deepcopy(self._poc, memo) return s def __getstate__(self): + self.flush() from ...util.caching import Cacher obs = [] for p, o, c in self: diff --git a/GPy/core/parameterization/param.py 
b/GPy/core/parameterization/param.py index 4490a8ee..9c3d7bd3 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -324,7 +324,7 @@ class ParamConcatenation(object): if update: self.update_all_params() def values(self): - return numpy.hstack([p.param_array for p in self.params]) + return numpy.hstack([p.param_array.flat for p in self.params]) #=========================================================================== # parameter operations: #=========================================================================== diff --git a/GPy/testing/pickle_tests.py b/GPy/testing/pickle_tests.py index b888353c..d975aaa3 100644 --- a/GPy/testing/pickle_tests.py +++ b/GPy/testing/pickle_tests.py @@ -197,6 +197,7 @@ class Test(ListDictTestCase): self.assertTrue(par.checkgrad()) self.assertTrue(pcopy.checkgrad()) self.assertTrue(pcopy.kern.checkgrad()) + import ipdb;ipdb.set_trace() self.assertIn(par.observers[0], pcopy.observers) self.assertEqual(par.count, 3) self.assertEqual(pcopy.count, 6) # 3 of each call to checkgrad From 954af5a6c20a44fcf935520206454d674b03b1b8 Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Fri, 4 Apr 2014 17:00:40 +0100 Subject: [PATCH 25/33] [GPU] varDTC_gpu minibatch --- .../latent_function_inference/var_dtc_gpu.py | 185 ++++++++++++------ GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py | 32 ++- GPy/util/linalg_gpu.py | 9 + 3 files changed, 162 insertions(+), 64 deletions(-) diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py index 59cf2b0a..e2c0e048 100644 --- a/GPy/inference/latent_function_inference/var_dtc_gpu.py +++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py @@ -15,7 +15,7 @@ try: from scikits.cuda import cublas import pycuda.autoinit from pycuda.reduction import ReductionKernel - from ...util.linalg_gpu import logDiagSum, strideSum, mul_bcast, sum_axis + from ...util.linalg_gpu import logDiagSum, strideSum, mul_bcast, sum_axis, outer_prod, mul_bcast_first, join_prod except: pass @@ -46,6 +46,7 @@ class VarDTC_GPU(object): def _initGPUCache(self, num_inducing, output_dim, Y): if self.gpuCache == None: + ndata = Y.shape[0] self.gpuCache = {# inference_likelihood 'Kmm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), 'Lm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), @@ -60,11 +61,19 @@ class VarDTC_GPU(object): 'dL_dKmm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), 'psi1Y_gpu' :gpuarray.empty((num_inducing,output_dim),np.float64,order='F'), 'psi2_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), - 'beta_gpu' :gpuarray.empty((output_dim,),np.float64,order='F'), + 'beta_gpu' :gpuarray.empty((ndata,),np.float64,order='F'), 'YT_gpu' :gpuarray.to_gpu(np.asfortranarray(Y).T), # DxN 'betaYT_gpu' :gpuarray.empty(Y.T.shape,np.float64,order='F'), # DxN - 'psi2_t_gpu' :gpuarray.empty((self.batchsize,num_inducing,num_inducing),np.float64,order='F'), + 'psi2_t_gpu' :gpuarray.empty((num_inducing*num_inducing*self.batchsize),np.float64,order='F'), # inference_minibatch + 'dL_dpsi0_gpu' :gpuarray.empty((self.batchsize,),np.float64,order='F'), + 'dL_dpsi1_gpu' :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'), + 'dL_dpsi2_gpu' :gpuarray.empty((self.batchsize,num_inducing,num_inducing),np.float64,order='F'), + 'dL_dthetaL_gpu' :gpuarray.empty((self.batchsize,),np.float64,order='F'), + 'psi2p_gpu' 
:gpuarray.empty((self.batchsize,num_inducing,num_inducing),np.float64,order='F'), + 'betapsi1_gpu' :gpuarray.empty((self.batchsize,num_inducing),order='F'), + 'thetaL_t_gpu' :gpuarray.empty((self.batchsize,),np.float64,order='F'), + 'betaYT2_gpu' :gpuarray.empty((output_dim,self.batchsize),order='F'), } self.gpuCache['ones_gpu'].fill(1.0) @@ -127,7 +136,6 @@ class VarDTC_GPU(object): psi1Y_gpu.fill(0.) psi2_gpu.fill(0.) psi0_full = 0 - psi1Y_full = np.zeros((num_inducing,output_dim),order='F') # MxD for n_start in xrange(0,num_data,self.batchsize): n_end = min(self.batchsize+n_start, num_data) @@ -138,7 +146,7 @@ class VarDTC_GPU(object): if ndata==self.batchsize: psi2_t_gpu_slice = psi2_t_gpu else: - psi2_t_gpu_slice = psi2_t_gpu[0:ndata] + psi2_t_gpu_slice = psi2_t_gpu[:num_inducing*num_inducing*ndata] if uncertain_inputs: psi0p_gpu = kern.psi0(Z, X_slice) psi1p_gpu = kern.psi1(Z, X_slice) @@ -148,10 +156,6 @@ class VarDTC_GPU(object): psi1p_gpu = kern.K(X_slice, Z) cublas.cublasDgemm(self.cublas_handle, 'T', 'T', num_inducing, output_dim, ndata, 1.0, psi1p_gpu.gpudata, ndata, betaYT_gpu_slice.gpudata, output_dim, 1.0, psi1Y_gpu.gpudata, num_inducing) - psi1Y_full += np.dot(psi1p_gpu.get().T,Y_slice)*beta # MxD -# print psi1Y_gpu.get() -# print psi1Y_full - print np.abs(psi1Y_gpu.get()-psi1Y_full).max() if het_noise: psi0_full += cublas.cublasDdot(self.cublas_handle, psi0p_gpu.size, beta_gpu_slice.gpudata, 1, psi0p_gpu.gpudata, 1) @@ -166,7 +170,7 @@ class VarDTC_GPU(object): sum_axis(psi2_gpu,psi2p_gpu,1,ndata) else: if het_noise: - psi1_t_gpu = psi2_t_gpu_slice[:,:,0] + psi1_t_gpu = psi2_t_gpu_slice[:,num_inducing*ndata] mul_bcast(psi1_t_gpu,beta_gpu_slice,psi1p_gpu,beta_gpu_slice.size) cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, num_inducing, ndata, 1.0, psi1p_gpu.gpudata, ndata, psi1_t_gpu.gpudata, ndata, 1.0, psi2_gpu.gpudata, num_inducing) else: @@ -181,7 +185,7 @@ class VarDTC_GPU(object): psi2_full = np.zeros((num_inducing,num_inducing),order='F') psi1Y_full = np.zeros((num_inducing,output_dim),order='F') # MxD psi0_full = 0 - YRY_full = 0 +# YRY_full = 0 for n_start in xrange(0,num_data,self.batchsize): n_end = min(self.batchsize+n_start, num_data) @@ -199,7 +203,7 @@ class VarDTC_GPU(object): beta_slice = beta[n_start:n_end] psi0_full += (beta_slice*psi0).sum() psi1Y_full += np.dot(psi1.T,beta_slice[:,None]*Y_slice) # MxD - YRY_full += (beta_slice*np.square(Y_slice).sum(axis=-1)).sum() +# YRY_full += (beta_slice*np.square(Y_slice).sum(axis=-1)).sum() else: psi0_full += psi0.sum() psi1Y_full += np.dot(psi1.T,Y_slice) # MxD @@ -219,7 +223,7 @@ class VarDTC_GPU(object): psi0_full *= beta psi1Y_full *= beta psi2_full *= beta - YRY_full = trYYT*beta +# YRY_full = trYYT*beta psi1Y_gpu.set(psi1Y_full) psi2_gpu.set(psi2_full) @@ -302,10 +306,6 @@ class VarDTC_GPU(object): cublas.cublasDaxpy(self.cublas_handle, KmmInvPsi2P_gpu.size, np.float64(-output_dim), KmmInvPsi2P_gpu.gpudata, 1, dL_dpsi2R_gpu.gpudata, 1) cublas.cublasDscal(self.cublas_handle, dL_dpsi2R_gpu.size, np.float64(-0.5), dL_dpsi2R_gpu.gpudata, 1) # print np.abs(dL_dpsi2R_gpu.get()-dL_dpsi2R).max() - - # Cache intermediate results - self.midRes['dL_dpsi2R'] = dL_dpsi2R_gpu.get() - self.midRes['v'] = v_gpu.get() #logDiagSum = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="i%step==0?log(x[i]):0", arguments="double *x, int step") @@ -351,18 +351,15 @@ class VarDTC_GPU(object): """ num_data, output_dim = Y.shape + num_inducing = Z.shape[0] if isinstance(X, VariationalPosterior): 
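        # (a VariationalPosterior here marks uncertain/variational inputs q(X),
        #  which are handled through the psi statistics psi0/psi1/psi2 below;
        #  a plain ndarray X falls back to the ordinary kernel calls Kdiag/K)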
uncertain_inputs = True else: uncertain_inputs = False - #see whether we've got a different noise variance for each datum beta = 1./np.fmax(likelihood.variance, 1e-6) het_noise = beta.size > 1 - # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency! - #self.YYTfactor = beta*self.get_YYTfactor(Y) - YYT_factor = Y n_start = self.batch_pos n_end = min(self.batchsize+n_start, num_data) @@ -373,68 +370,144 @@ class VarDTC_GPU(object): isEnd = False self.batch_pos = n_end - num_slice = n_end-n_start - Y_slice = YYT_factor[n_start:n_end] + nSlice = n_end-n_start + Y_slice = Y[n_start:n_end] X_slice = X[n_start:n_end] if uncertain_inputs: - psi0 = kern.psi0(Z, X_slice) - psi1 = kern.psi1(Z, X_slice) - psi2 = kern.psi2(Z, X_slice) + psi0p_gpu = kern.psi0(Z, X_slice) + psi1p_gpu = kern.psi1(Z, X_slice) + psi2p_gpu = kern.psi2(Z, X_slice) else: - psi0 = kern.Kdiag(X_slice) - psi1 = kern.K(X_slice, Z) - psi2 = None + psi0p_gpu = kern.Kdiag(X_slice) + psi1p_gpu = kern.K(X_slice, Z) if het_noise: beta = beta[n_start:n_end] - betaY = beta*Y_slice - betapsi1 = np.einsum('n,nm->nm',beta,psi1) - - betaY_gpu = gpuarray.to_gpu(betaY) - betapsi1_gpu = gpuarray.to_gpu(betapsi1) - +# betapsi1 = np.einsum('n,nm->nm',beta,psi1) +# +# # betaY_gpu = gpuarray.to_gpu(betaY) +# betapsi1_gpu = gpuarray.to_gpu(betapsi1) + #====================================================================== - # Load Intermediate Results + # Prepare gpu memory #====================================================================== - dL_dpsi2R = self.midRes['dL_dpsi2R'] - v = self.midRes['v'] + dL_dpsi2R_gpu = self.gpuCache['dL_dpsi2R_gpu'] + v_gpu = self.gpuCache['v_gpu'] + betaYT_gpu = self.gpuCache['betaYT_gpu'] + beta_gpu = self.gpuCache['beta_gpu'] + dL_dpsi0_gpu = self.gpuCache['dL_dpsi0_gpu'] + dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu'] + dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu'] + dL_dthetaL_gpu = self.gpuCache['dL_dthetaL_gpu'] + psi2R_gpu = self.gpuCache['psi2_t_gpu'][:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing) + psi2p_gpu = self.gpuCache['psi2p_gpu'] + betapsi1_gpu = self.gpuCache['betapsi1_gpu'] + thetaL_t_gpu = self.gpuCache['thetaL_t_gpu'] + betaYT2_gpu = self.gpuCache['betaYT2_gpu'] + + betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end] + beta_gpu_slice = beta_gpu[n_start:n_end] + + # Adjust to the batch size + if dL_dpsi0_gpu.shape[0] < nSlice: + betaYT2_gpu = betaYT2_gpu[:,:nSlice] + dL_dpsi0_gpu = dL_dpsi0_gpu.ravel()[:nSlice] + dL_dpsi1_gpu = dL_dpsi1_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing) + dL_dpsi2_gpu = dL_dpsi2_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing) + dL_dthetaL_gpu = dL_dthetaL_gpu.ravel()[:nSlice] + psi2R_gpu = psi2R_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing) + thetaL_t_gpu = thetaL_t_gpu.ravel()[:nSlice] + betapsi1_gpu = betapsi1_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing) + if not uncertain_inputs: + psi2p_gpu = psi2p_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing) + + mul_bcast(betapsi1_gpu,beta_gpu_slice,psi1p_gpu,beta_gpu_slice.size) #====================================================================== # Compute dL_dpsi #====================================================================== - dL_dpsi0 = -0.5 * output_dim * (beta * np.ones((n_end-n_start,))) + dL_dpsi0_gpu.fill(0.) 
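        # (cublasDaxpy computes y <- alpha*x + y in place, so with dL_dpsi0_gpu
        #  zeroed first the call below realises the commented numpy line under it:
        #  dL_dpsi0_n = -(output_dim/2.) * beta_n for each point in the slice)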
+ cublas.cublasDaxpy(self.cublas_handle, dL_dpsi0_gpu.size, output_dim/(-2.), beta_gpu_slice.gpudata, 1, dL_dpsi0_gpu.gpudata, 1) +# dL_dpsi0_gpu = -0.5 * output_dim * (beta * np.ones((n_end-n_start,))) - dL_dpsi1 = np.dot(betaY,v.T) + cublas.cublasDgemm(self.cublas_handle, 'T', 'T', nSlice, num_inducing, output_dim, 1.0, betaYT_gpu_slice.gpudata, output_dim, v_gpu.gpudata, num_inducing, 0., dL_dpsi1_gpu.gpudata, nSlice) +# dL_dpsi1 = np.dot(betaY,v.T) if uncertain_inputs: - dL_dpsi2 = np.einsum('n,mo->nmo',beta * np.ones((n_end-n_start,)),dL_dpsi2R) + outer_prod(dL_dpsi2_gpu,beta_gpu_slice,dL_dpsi2R_gpu,beta_gpu_slice.size) +# dL_dpsi2 = np.einsum('n,mo->nmo',beta * np.ones((n_end-n_start,)),dL_dpsi2R) else: - dL_dpsi1 += np.dot(betapsi1,dL_dpsi2R)*2. - dL_dpsi2 = None + cublas.cublasDgemm(self.cublas_handle, 'N', 'N', nSlice, num_inducing, output_dim, 1.0, betapsi1_gpu.gpudata, nSlice, dL_dpsi2R_gpu.gpudata, num_inducing, 1.0, dL_dpsi1_gpu.gpudata, nSlice) +# dL_dpsi1 += np.dot(betapsi1,dL_dpsi2R)*2. #====================================================================== # Compute dL_dthetaL #====================================================================== + + if not uncertain_inputs: + join_prod(psi2p_gpu,psi1p_gpu,psi1p_gpu,nSlice,num_inducing) - if het_noise: - if uncertain_inputs: - psiR = np.einsum('mo,nmo->n',dL_dpsi2R,psi2) - else: - psiR = np.einsum('nm,no,mo->n',psi1,psi1,dL_dpsi2R) - - dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0)-output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum(axis=-1) + mul_bcast_first(psi2R_gpu,dL_dpsi2R_gpu,psi2p_gpu,nSlice) + + + dL_dthetaL_gpu.fill(0.) + + cublas.cublasDcopy(self.cublas_handle, betaYT_gpu_slice.size, betaYT_gpu_slice.gpudata, 1, betaYT2_gpu.gpudata, 1) + mul_bcast(betaYT2_gpu,betaYT2_gpu,betaYT2_gpu,betaYT2_gpu.size) + cublas.cublasDscal(self.cublas_handle, betaYT2_gpu.size, 0.5, betaYT2_gpu.gpudata, 1) + sum_axis(dL_dthetaL_gpu, betaYT2_gpu, 1, output_dim) + + cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, output_dim/(-2.0), beta_gpu_slice.gpudata, 1, dL_dthetaL_gpu.gpudata, 1) + cublas.cublasDcopy(self.cublas_handle, beta_gpu_slice.size, beta_gpu_slice.gpudata, 1, thetaL_t_gpu.gpudata, 1) + mul_bcast(thetaL_t_gpu,thetaL_t_gpu,thetaL_t_gpu,thetaL_t_gpu.size) + mul_bcast(thetaL_t_gpu,thetaL_t_gpu,psi0p_gpu,thetaL_t_gpu.size) + cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, output_dim/2.0, thetaL_t_gpu.gpudata, 1, dL_dthetaL_gpu.gpudata, 1) + + thetaL_t_gpu.fill(0.) + sum_axis(thetaL_t_gpu, psi2R_gpu, nSlice, num_inducing*num_inducing) + mul_bcast(thetaL_t_gpu,thetaL_t_gpu,beta_gpu_slice,thetaL_t_gpu.size) + mul_bcast(thetaL_t_gpu,thetaL_t_gpu,beta_gpu_slice,thetaL_t_gpu.size) + cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, -1.0, thetaL_t_gpu.gpudata, 1, dL_dthetaL_gpu.gpudata, 1) + + cublas.cublasDgemm(self.cublas_handle, 'T', 'T', output_dim, nSlice, num_inducing, 1.0, betapsi1_gpu.gpudata, nSlice, v_gpu.gpudata, num_inducing, 0.0, betaYT2_gpu.gpudata, output_dim) + mul_bcast(betaYT2_gpu,betaYT2_gpu,betaYT_gpu_slice,betaYT2_gpu.size) + sum_axis(dL_dthetaL_gpu, betaYT2_gpu, 1, output_dim) + +# if het_noise: +# if uncertain_inputs: +# psiR = np.einsum('mo,nmo->n',dL_dpsi2R,psi2) +# else: +# psiR = np.einsum('nm,no,mo->n',psi1,psi1,dL_dpsi2R) +# +# dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0)-output_dim*beta)/2. 
- np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum(axis=-1)
+#        else:
+#            if uncertain_inputs:
+#                psiR = np.einsum('mo,nmo->',dL_dpsi2R,psi2)
+#            else:
+#                psiR = np.einsum('nm,no,mo->',psi1,psi1,dL_dpsi2R)
+#
+#            dL_dthetaL = ((np.square(betaY)).sum() + np.square(beta)*output_dim*(psi0.sum())-num_slice*output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum()
+
+
+        if kern.useGPU:
+            dL_dpsi0 = dL_dpsi0_gpu
+            dL_dpsi1 = dL_dpsi1_gpu
         else:
-            if uncertain_inputs:
-                psiR = np.einsum('mo,nmo->',dL_dpsi2R,psi2)
+            dL_dpsi0 = dL_dpsi0_gpu.get()
+            dL_dpsi1 = dL_dpsi1_gpu.get()
+        if uncertain_inputs:
+            if kern.useGPU:
+                dL_dpsi2 = dL_dpsi2_gpu
             else:
-                psiR = np.einsum('nm,no,mo->',psi1,psi1,dL_dpsi2R)
-
-            dL_dthetaL = ((np.square(betaY)).sum() + np.square(beta)*output_dim*(psi0.sum())-num_slice*output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum()
-
+                dL_dpsi2 = dL_dpsi2_gpu.get()
+        if het_noise:
+            dL_dthetaL = dL_dthetaL_gpu.get()
+        else:
+            dL_dthetaL = gpuarray.sum(dL_dthetaL_gpu).get()

         if uncertain_inputs:
             grad_dict = {'dL_dpsi0':dL_dpsi0, 'dL_dpsi1':dL_dpsi1,
diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py
index 263884dd..bafe85ce 100644
--- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py
+++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py
@@ -258,13 +258,17 @@ class PSICOMP_SSRBF(object):
     def __init__(self):
         self.cublas_handle = cublas.cublasCreate()
         self.gpuCache = None
+        self.gpuCacheAll = None

     def _initGPUCache(self, N, M, Q):
-        if self.gpuCache and self.gpuCache['mu_gpu'].shape[0]!=N:
+        if self.gpuCache!=None and self.gpuCache['mu_gpu'].shape[0] == N:
+            return
+
+        if self.gpuCacheAll!=None and self.gpuCacheAll['mu_gpu'].shape[0]<N: # existing cache too small -> reallocate
             self._releaseMemory()

-        if self.gpuCache == None:
-            self.gpuCache = {
+        if self.gpuCacheAll == None:
+            self.gpuCacheAll = {
                 'l_gpu' :gpuarray.empty((Q,),np.float64,order='F'),
                 'Z_gpu' :gpuarray.empty((M,Q),np.float64,order='F'),
                 'mu_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
@@ -304,13 +308,24 @@ class PSICOMP_SSRBF(object):
                 'grad_S_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
                 'grad_gamma_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
             }
+            self.gpuCache = self.gpuCacheAll
+        elif self.gpuCacheAll['mu_gpu'].shape[0]==N:
+            self.gpuCache = self.gpuCacheAll
+        else:
+            # remap to a smaller cache
+            self.gpuCache = self.gpuCacheAll.copy()
+            Nlist=['mu_gpu','S_gpu','gamma_gpu','logGamma_gpu','log1Gamma_gpu','logpsi1denom_gpu','logpsi2denom_gpu','psi0_gpu','psi1_gpu','psi2_gpu',
+                   'psi1_neq_gpu','psi1exp1_gpu','psi1exp2_gpu','dpsi1_dvar_gpu','dpsi1_dl_gpu','dpsi1_dZ_gpu','dpsi1_dgamma_gpu','dpsi1_dmu_gpu',
+                   'dpsi1_dS_gpu','psi2_neq_gpu','psi2exp1_gpu','dpsi2_dvar_gpu','dpsi2_dl_gpu','dpsi2_dZ_gpu','dpsi2_dgamma_gpu','dpsi2_dmu_gpu','dpsi2_dS_gpu','grad_mu_gpu','grad_S_gpu','grad_gamma_gpu',]
+            oldN = self.gpuCacheAll['mu_gpu'].shape[0]
+            for v in Nlist:
+                u = self.gpuCacheAll[v]
+                self.gpuCache[v] = u.ravel()[:u.size/oldN*N].reshape(*((N,)+u.shape[1:]))

     def _releaseMemory(self):
-        if not self.gpuCache:
-            for k,v in self.gpuCache:
-                v.gpudata.free()
-                del v
-            del self.gpuCache
+        if self.gpuCacheAll!=None:
+            [v.gpudata.free() for v in self.gpuCacheAll.values()]
+            self.gpuCacheAll = None
             self.gpuCache = None

     def psicomputations(self, variance, lengthscale, Z, mu, S, gamma):
@@ -351,6 +366,7 @@
         comp_logpsidenom(logpsi1denom_gpu, S_gpu,l_gpu,1.0,N)
         comp_logpsidenom(logpsi2denom_gpu, S_gpu,l_gpu,2.0,N)

+        psi0_gpu.fill(variance)
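        # psi0 can simply be filled with the scalar kernel variance: for an
        # RBF-type kernel k(x,x) = variance independently of x, so the
        # expectation psi0_n = <k(x_n,x_n)>_q(x_n) is constant
        # (numpy sketch: psi0 = variance*np.ones(N))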
comp_psi1(psi1_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi1denom_gpu, N, M, Q) comp_psi2(psi2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi2denom_gpu, N, M, Q) diff --git a/GPy/util/linalg_gpu.py b/GPy/util/linalg_gpu.py index 039b0d62..6062d135 100644 --- a/GPy/util/linalg_gpu.py +++ b/GPy/util/linalg_gpu.py @@ -31,6 +31,9 @@ try: # multiplication with broadcast on the last dimension (out = shorter[:,None]*longer) mul_bcast = ElementwiseKernel("double *out, double *shorter, double *longer, int shorter_size", "out[i] = longer[i]*shorter[i%shorter_size]", "mul_bcast") + # multiplication with broadcast on the first dimension (out = shorter[None,:]*longer) + mul_bcast_first = ElementwiseKernel("double *out, double *shorter, double *longer, int first_dim", "out[i] = longer[i]*shorter[i/first_dim]", "mul_bcast") + # sum through the middle dimension (size_2) of a 3D matrix (size_1, size_2, size_3) sum_axis = ElementwiseKernel("double *out, double *in, int size_1, int size_2", "out[i] += sum_axis_element(in, size_1, size_2, i)", "sum_axis",preamble=""" __device__ double sum_axis_element(double *in, int size_1, int size_2, int idx) @@ -45,5 +48,11 @@ try: } """) + # the outer product between two vectors (out = np.dot(v1,v2.T)) + outer_prod = ElementwiseKernel("double *out, double *v1, double *v2, int v1_size", "out[i] = v1[i%v1_size]*v2[i/v1_size]", "outer_prod") + + # the outer product between two vectors (out = np.einsum('na,nb->nab',m1,m2) a=dim1, b=dim2 ) + join_prod = ElementwiseKernel("double *out, double *m1, double *m2, int dim1, int dim2", "out[i] = m1[(i%dim1)*dim1+(i%(dim1*dim2))/dim1]*m2[(i%dim1)*dim1+i/(dim1*dim2)]", "join_prod") + except: pass From 7a74c0b80d863dbcdd7a706b71d44c0eb012612f Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Fri, 4 Apr 2014 18:02:53 +0100 Subject: [PATCH 26/33] [GPU] varDTC_gpu almost done --- .../latent_function_inference/var_dtc_gpu.py | 63 +++++++++++-------- GPy/kern/_src/rbf.py | 20 +++--- 2 files changed, 46 insertions(+), 37 deletions(-) diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py index e2c0e048..e70f71ba 100644 --- a/GPy/inference/latent_function_inference/var_dtc_gpu.py +++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py @@ -62,7 +62,7 @@ class VarDTC_GPU(object): 'psi1Y_gpu' :gpuarray.empty((num_inducing,output_dim),np.float64,order='F'), 'psi2_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), 'beta_gpu' :gpuarray.empty((ndata,),np.float64,order='F'), - 'YT_gpu' :gpuarray.to_gpu(np.asfortranarray(Y).T), # DxN + 'YT_gpu' :gpuarray.to_gpu(np.asfortranarray(Y.T)), # DxN 'betaYT_gpu' :gpuarray.empty(Y.T.shape,np.float64,order='F'), # DxN 'psi2_t_gpu' :gpuarray.empty((num_inducing*num_inducing*self.batchsize),np.float64,order='F'), # inference_minibatch @@ -70,10 +70,12 @@ class VarDTC_GPU(object): 'dL_dpsi1_gpu' :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'), 'dL_dpsi2_gpu' :gpuarray.empty((self.batchsize,num_inducing,num_inducing),np.float64,order='F'), 'dL_dthetaL_gpu' :gpuarray.empty((self.batchsize,),np.float64,order='F'), - 'psi2p_gpu' :gpuarray.empty((self.batchsize,num_inducing,num_inducing),np.float64,order='F'), - 'betapsi1_gpu' :gpuarray.empty((self.batchsize,num_inducing),order='F'), + 'betapsi1_gpu' :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'), 'thetaL_t_gpu' :gpuarray.empty((self.batchsize,),np.float64,order='F'), 
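                # (per-minibatch work buffers: one scalar per data point for
                #  dL_dpsi0/dL_dthetaL, batchsize x M for the psi1-sized terms,
                #  batchsize x M x M for the psi2-sized terms; this hunk also
                #  adds the np.float64 dtypes that were missing above)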
- 'betaYT2_gpu' :gpuarray.empty((output_dim,self.batchsize),order='F'), + 'betaYT2_gpu' :gpuarray.empty((output_dim,self.batchsize),np.float64,order='F'), + 'psi0p_gpu' :gpuarray.empty((self.batchsize,),np.float64,order='F'), + 'psi1p_gpu' :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'), + 'psi2p_gpu' :gpuarray.empty((self.batchsize,num_inducing,num_inducing),np.float64,order='F'), } self.gpuCache['ones_gpu'].fill(1.0) @@ -371,24 +373,38 @@ class VarDTC_GPU(object): self.batch_pos = n_end nSlice = n_end-n_start - Y_slice = Y[n_start:n_end] X_slice = X[n_start:n_end] - if uncertain_inputs: - psi0p_gpu = kern.psi0(Z, X_slice) - psi1p_gpu = kern.psi1(Z, X_slice) - psi2p_gpu = kern.psi2(Z, X_slice) + if kern.useGPU: + if uncertain_inputs: + psi0p_gpu = kern.psi0(Z, X_slice) + psi1p_gpu = kern.psi1(Z, X_slice) + psi2p_gpu = kern.psi2(Z, X_slice) + else: + psi0p_gpu = kern.Kdiag(X_slice) + psi1p_gpu = kern.K(X_slice, Z) + psi2p_gpu = self.gpuCache['psi2p_gpu'] + if psi2p_gpu.shape[0] > nSlice: + psi2p_gpu = psi2p_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing) else: - psi0p_gpu = kern.Kdiag(X_slice) - psi1p_gpu = kern.K(X_slice, Z) - - if het_noise: - beta = beta[n_start:n_end] + if uncertain_inputs: + psi0 = kern.psi0(Z, X_slice) + psi1 = kern.psi1(Z, X_slice) + psi2 = kern.psi2(Z, X_slice) + else: + psi0 = kern.Kdiag(X_slice) + psi1 = kern.K(X_slice, Z) -# betapsi1 = np.einsum('n,nm->nm',beta,psi1) -# -# # betaY_gpu = gpuarray.to_gpu(betaY) -# betapsi1_gpu = gpuarray.to_gpu(betapsi1) + psi0p_gpu = self.gpuCache['psi0p_gpu'] + psi1p_gpu = self.gpuCache['psi1p_gpu'] + psi2p_gpu = self.gpuCache['psi2p_gpu'] + if psi0p_gpu > nSlice: + psi0p_gpu = psi0p_gpu[:nSlice] + psi1p_gpu = psi1p_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing) + psi2p_gpu = psi2p_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing) + psi0p_gpu.get(psi0) + psi1p_gpu.get(psi1) + psi2p_gpu.get(psi2) #====================================================================== # Prepare gpu memory @@ -403,7 +419,6 @@ class VarDTC_GPU(object): dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu'] dL_dthetaL_gpu = self.gpuCache['dL_dthetaL_gpu'] psi2R_gpu = self.gpuCache['psi2_t_gpu'][:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing) - psi2p_gpu = self.gpuCache['psi2p_gpu'] betapsi1_gpu = self.gpuCache['betapsi1_gpu'] thetaL_t_gpu = self.gpuCache['thetaL_t_gpu'] betaYT2_gpu = self.gpuCache['betaYT2_gpu'] @@ -412,7 +427,7 @@ class VarDTC_GPU(object): beta_gpu_slice = beta_gpu[n_start:n_end] # Adjust to the batch size - if dL_dpsi0_gpu.shape[0] < nSlice: + if dL_dpsi0_gpu.shape[0] > nSlice: betaYT2_gpu = betaYT2_gpu[:,:nSlice] dL_dpsi0_gpu = dL_dpsi0_gpu.ravel()[:nSlice] dL_dpsi1_gpu = dL_dpsi1_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing) @@ -421,8 +436,6 @@ class VarDTC_GPU(object): psi2R_gpu = psi2R_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing) thetaL_t_gpu = thetaL_t_gpu.ravel()[:nSlice] betapsi1_gpu = betapsi1_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing) - if not uncertain_inputs: - psi2p_gpu = psi2p_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing) mul_bcast(betapsi1_gpu,beta_gpu_slice,psi1p_gpu,beta_gpu_slice.size) @@ -432,17 +445,13 @@ class VarDTC_GPU(object): dL_dpsi0_gpu.fill(0.) 
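        # (numpy equivalents of the cuBLAS block below, as they stood in the
        #  comments this hunk removes:
        #      dL_dpsi0 = -0.5 * output_dim * (beta * np.ones((n_end-n_start,)))
        #      dL_dpsi1 = np.dot(betaY, v.T)
        #      dL_dpsi2 = np.einsum('n,mo->nmo', beta*np.ones((n_end-n_start,)), dL_dpsi2R)  # uncertain inputs
        #      dL_dpsi1 += np.dot(betapsi1, dL_dpsi2R)*2.                                    # certain inputs)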
cublas.cublasDaxpy(self.cublas_handle, dL_dpsi0_gpu.size, output_dim/(-2.), beta_gpu_slice.gpudata, 1, dL_dpsi0_gpu.gpudata, 1) -# dL_dpsi0_gpu = -0.5 * output_dim * (beta * np.ones((n_end-n_start,))) cublas.cublasDgemm(self.cublas_handle, 'T', 'T', nSlice, num_inducing, output_dim, 1.0, betaYT_gpu_slice.gpudata, output_dim, v_gpu.gpudata, num_inducing, 0., dL_dpsi1_gpu.gpudata, nSlice) -# dL_dpsi1 = np.dot(betaY,v.T) if uncertain_inputs: outer_prod(dL_dpsi2_gpu,beta_gpu_slice,dL_dpsi2R_gpu,beta_gpu_slice.size) -# dL_dpsi2 = np.einsum('n,mo->nmo',beta * np.ones((n_end-n_start,)),dL_dpsi2R) else: cublas.cublasDgemm(self.cublas_handle, 'N', 'N', nSlice, num_inducing, output_dim, 1.0, betapsi1_gpu.gpudata, nSlice, dL_dpsi2R_gpu.gpudata, num_inducing, 1.0, dL_dpsi1_gpu.gpudata, nSlice) -# dL_dpsi1 += np.dot(betapsi1,dL_dpsi2R)*2. #====================================================================== # Compute dL_dthetaL @@ -473,7 +482,7 @@ class VarDTC_GPU(object): mul_bcast(thetaL_t_gpu,thetaL_t_gpu,beta_gpu_slice,thetaL_t_gpu.size) cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, -1.0, thetaL_t_gpu.gpudata, 1, dL_dthetaL_gpu.gpudata, 1) - cublas.cublasDgemm(self.cublas_handle, 'T', 'T', output_dim, nSlice, num_inducing, 1.0, betapsi1_gpu.gpudata, nSlice, v_gpu.gpudata, num_inducing, 0.0, betaYT2_gpu.gpudata, output_dim) + cublas.cublasDgemm(self.cublas_handle, 'T', 'T', output_dim, nSlice, num_inducing, -1.0, v_gpu.gpudata, num_inducing, betapsi1_gpu.gpudata, nSlice, 0.0, betaYT2_gpu.gpudata, output_dim) mul_bcast(betaYT2_gpu,betaYT2_gpu,betaYT_gpu_slice,betaYT2_gpu.size) sum_axis(dL_dthetaL_gpu, betaYT2_gpu, 1, output_dim) diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index e5da3d97..2534ad9b 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -74,10 +74,10 @@ class RBF(Stationary): # Spike-and-Slab GPLVM if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): if self.useGPU: - dL_dpsi0_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi0)) - dL_dpsi1_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi1)) - dL_dpsi2_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi2)) - self.psicomp.update_gradients_expectations(dL_dpsi0_gpu, dL_dpsi1_gpu, dL_dpsi2_gpu, self.variance, self.lengthscale, Z, variational_posterior) +# dL_dpsi0_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi0)) +# dL_dpsi1_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi1)) +# dL_dpsi2_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi2)) + self.psicomp.update_gradients_expectations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior) else: _, _dpsi1_dvariance, _, _, _, _, _dpsi1_dlengthscale = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) @@ -139,9 +139,9 @@ class RBF(Stationary): # Spike-and-Slab GPLVM if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): if self.useGPU: - dL_dpsi1_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi1)) - dL_dpsi2_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi2)) - return self.psicomp.gradients_Z_expectations(dL_dpsi1_gpu, dL_dpsi2_gpu, self.variance, self.lengthscale, Z, variational_posterior) +# dL_dpsi1_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi1)) +# dL_dpsi2_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi2)) + return self.psicomp.gradients_Z_expectations(dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior) else: _, _, _, _, _, _dpsi1_dZ, _ = 
ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) _, _, _, _, _, _dpsi2_dZ, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob) @@ -177,9 +177,9 @@ class RBF(Stationary): # Spike-and-Slab GPLVM if isinstance(variational_posterior, variational.SpikeAndSlabPosterior): if self.useGPU: - dL_dpsi1_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi1)) - dL_dpsi2_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi2)) - return self.psicomp.gradients_qX_expectations(dL_dpsi1_gpu, dL_dpsi2_gpu, self.variance, self.lengthscale, Z, variational_posterior) +# dL_dpsi1_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi1)) +# dL_dpsi2_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi2)) + return self.psicomp.gradients_qX_expectations(dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior) else: ndata = variational_posterior.mean.shape[0] From 934ecc7e9560fe27debb513b888a95abd128146a Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Fri, 4 Apr 2014 19:03:35 +0100 Subject: [PATCH 27/33] [GPU] varDTC_gpu bug fix --- .../latent_function_inference/var_dtc_gpu.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py index e70f71ba..793d9bf7 100644 --- a/GPy/inference/latent_function_inference/var_dtc_gpu.py +++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py @@ -30,7 +30,7 @@ class VarDTC_GPU(object): """ const_jitter = np.float64(1e-6) - def __init__(self, batchsize, limit=1): + def __init__(self, batchsize=None, limit=1): self.batchsize = batchsize @@ -45,6 +45,8 @@ class VarDTC_GPU(object): self.gpuCache = None def _initGPUCache(self, num_inducing, output_dim, Y): + if self.batchsize==None: + self.batchsize = Y.shape[0] if self.gpuCache == None: ndata = Y.shape[0] self.gpuCache = {# inference_likelihood @@ -398,13 +400,14 @@ class VarDTC_GPU(object): psi0p_gpu = self.gpuCache['psi0p_gpu'] psi1p_gpu = self.gpuCache['psi1p_gpu'] psi2p_gpu = self.gpuCache['psi2p_gpu'] - if psi0p_gpu > nSlice: + if psi0p_gpu.shape[0] > nSlice: psi0p_gpu = psi0p_gpu[:nSlice] psi1p_gpu = psi1p_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing) psi2p_gpu = psi2p_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing) - psi0p_gpu.get(psi0) - psi1p_gpu.get(psi1) - psi2p_gpu.get(psi2) + psi0p_gpu.set(np.asfortranarray(psi0)) + psi1p_gpu.set(np.asfortranarray(psi1)) + if uncertain_inputs: + psi2p_gpu.set(np.asfortranarray(psi2)) #====================================================================== # Prepare gpu memory From 5e01b94d37e8d2e4d4f57ba7ea61761bb38dd1c4 Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Fri, 4 Apr 2014 19:30:49 +0100 Subject: [PATCH 28/33] [GPU] caching not working --- GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py index bafe85ce..14414222 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py @@ -328,6 +328,7 @@ class PSICOMP_SSRBF(object): self.gpuCacheAll = None self.gpuCache = None + @Cache_this(limit=1) def psicomputations(self, variance, lengthscale, Z, mu, S, gamma): 
"""Compute Psi statitsitcs""" if isinstance(lengthscale, np.ndarray) and len(lengthscale)>1: @@ -370,9 +371,9 @@ class PSICOMP_SSRBF(object): comp_psi1(psi1_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi1denom_gpu, N, M, Q) comp_psi2(psi2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi2denom_gpu, N, M, Q) -# return psi0_gpu.get(), psi1_gpu.get(), psi2_gpu.get() return psi0_gpu, psi1_gpu, psi2_gpu - + + @Cache_this(limit=1) def _psiDercomputations(self, variance, lengthscale, Z, mu, S, gamma): """Compute the derivatives w.r.t. Psi statistics""" N, M, Q = mu.shape[0],Z.shape[0], mu.shape[1] From 9d312ab8ffca515650173455be3b4416e6673e66 Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Sat, 5 Apr 2014 00:10:35 +0100 Subject: [PATCH 29/33] bug fix: caching.py w.r.t. ignore_args --- GPy/util/caching.py | 1 + 1 file changed, 1 insertion(+) diff --git a/GPy/util/caching.py b/GPy/util/caching.py index ced56727..bb162ee3 100644 --- a/GPy/util/caching.py +++ b/GPy/util/caching.py @@ -66,6 +66,7 @@ class Cacher(object): #first make sure the depth limit isn't exceeded if len(self.cached_inputs) == self.limit: args_ = self.cached_inputs.pop(0) + args_ = [a for i,a in enumerate(args_) if i not in self.ignore_args and i not in self.force_kwargs] [a.remove_observer(self, self.on_cache_changed) for a in args_ if a is not None] self.inputs_changed.pop(0) self.cached_outputs.pop(0) From f776db45dfb3f9fb3667e3fc0a06596b52730731 Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Mon, 7 Apr 2014 10:18:03 +0100 Subject: [PATCH 30/33] [GPU] psi varDTC ready --- .../latent_function_inference/var_dtc_parallel.py | 9 +++++---- GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py | 4 ++-- GPy/util/caching.py | 1 + 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/GPy/inference/latent_function_inference/var_dtc_parallel.py b/GPy/inference/latent_function_inference/var_dtc_parallel.py index 4b29b16a..87236e2a 100644 --- a/GPy/inference/latent_function_inference/var_dtc_parallel.py +++ b/GPy/inference/latent_function_inference/var_dtc_parallel.py @@ -302,18 +302,19 @@ def update_gradients(model): while not isEnd: isEnd, n_range, grad_dict = model.inference_method.inference_minibatch(model.kern, model.X, model.Z, model.likelihood, model.Y) if isinstance(model.X, VariationalPosterior): + X_slice = model.X[n_range[0]:n_range[1]] #gradients w.r.t. kernel - model.kern.update_gradients_expectations(variational_posterior=model.X[n_range[0]:n_range[1]], Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2']) + model.kern.update_gradients_expectations(variational_posterior=X_slice, Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2']) kern_grad += model.kern.gradient #gradients w.r.t. Z model.Z.gradient[:,model.kern.active_dims] += model.kern.gradients_Z_expectations( - grad_dict['dL_dpsi1'], grad_dict['dL_dpsi2'], Z=model.Z, variational_posterior=model.X[n_range[0]:n_range[1]]) + grad_dict['dL_dpsi1'], grad_dict['dL_dpsi2'], Z=model.Z, variational_posterior=X_slice) #gradients w.r.t. 
posterior parameters of X - X_grad = model.kern.gradients_qX_expectations(variational_posterior=model.X[n_range[0]:n_range[1]], Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2']) - model.set_X_gradients(model.X[n_range[0]:n_range[1]], X_grad) + X_grad = model.kern.gradients_qX_expectations(variational_posterior=X_slice, Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2']) + model.set_X_gradients(X_slice, X_grad) if het_noise: dL_dthetaL[n_range[0]:n_range[1]] = grad_dict['dL_dthetaL'] diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py index 14414222..a695d14e 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py @@ -328,7 +328,7 @@ class PSICOMP_SSRBF(object): self.gpuCacheAll = None self.gpuCache = None - @Cache_this(limit=1) + @Cache_this(limit=1,ignore_args=(0,)) def psicomputations(self, variance, lengthscale, Z, mu, S, gamma): """Compute Psi statitsitcs""" if isinstance(lengthscale, np.ndarray) and len(lengthscale)>1: @@ -373,7 +373,7 @@ class PSICOMP_SSRBF(object): return psi0_gpu, psi1_gpu, psi2_gpu - @Cache_this(limit=1) + @Cache_this(limit=1,ignore_args=(0,)) def _psiDercomputations(self, variance, lengthscale, Z, mu, S, gamma): """Compute the derivatives w.r.t. Psi statistics""" N, M, Q = mu.shape[0],Z.shape[0], mu.shape[1] diff --git a/GPy/util/caching.py b/GPy/util/caching.py index 282c9f8c..676c3ab8 100644 --- a/GPy/util/caching.py +++ b/GPy/util/caching.py @@ -66,6 +66,7 @@ class Cacher(object): #first make sure the depth limit isn't exceeded if len(self.cached_inputs) == self.limit: args_ = self.cached_inputs.pop(0) + args_ = [a for i,a in enumerate(args_) if i not in self.ignore_args and i not in self.force_kwargs] [a.remove_observer(self, self.on_cache_changed) for a in args_ if a is not None] self.inputs_changed.pop(0) self.cached_outputs.pop(0) From 01860455afa0d775a1b8e79039232d5be407e3e9 Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Mon, 7 Apr 2014 11:55:46 +0100 Subject: [PATCH 31/33] [GPU] add automatic batchsize estimation --- .../latent_function_inference/var_dtc_gpu.py | 73 +++++++++++-------- GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py | 18 ++++- GPy/util/gpu_init.py | 16 ++++ GPy/util/linalg_gpu.py | 4 +- 4 files changed, 75 insertions(+), 36 deletions(-) create mode 100644 GPy/util/gpu_init.py diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py index 793d9bf7..a3fe0782 100644 --- a/GPy/inference/latent_function_inference/var_dtc_gpu.py +++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py @@ -9,12 +9,12 @@ import numpy as np from ...util.misc import param_to_array log_2_pi = np.log(2*np.pi) +from ...util import gpu_init +assert gpu_init.initSuccess + try: - import scikits.cuda.linalg as culinalg import pycuda.gpuarray as gpuarray from scikits.cuda import cublas - import pycuda.autoinit - from pycuda.reduction import ReductionKernel from ...util.linalg_gpu import logDiagSum, strideSum, mul_bcast, sum_axis, outer_prod, mul_bcast_first, join_prod except: pass @@ -30,25 +30,24 @@ class VarDTC_GPU(object): """ const_jitter = np.float64(1e-6) - def __init__(self, batchsize=None, limit=1): + def __init__(self, batchsize=None, gpu_memory=4., limit=1): self.batchsize = batchsize + self.gpu_memory = gpu_memory self.midRes = {} self.batch_pos = 0 # the starting position of the 
current mini-batch - # Initialize GPU environment - culinalg.init() - self.cublas_handle = cublas.cublasCreate() + self.cublas_handle = gpu_init.cublas_handle # Initialize GPU caches self.gpuCache = None - def _initGPUCache(self, num_inducing, output_dim, Y): + def _initGPUCache(self, kern, num_inducing, input_dim, output_dim, Y): + ndata = Y.shape[0] if self.batchsize==None: - self.batchsize = Y.shape[0] + self.batchsize = self._estimateBatchSize(kern, ndata, num_inducing, input_dim, output_dim) if self.gpuCache == None: - ndata = Y.shape[0] self.gpuCache = {# inference_likelihood 'Kmm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), 'Lm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), @@ -83,6 +82,34 @@ class VarDTC_GPU(object): YT_gpu = self.gpuCache['YT_gpu'] self._trYYT = cublas.cublasDdot(self.cublas_handle, YT_gpu.size, YT_gpu.gpudata, 1, YT_gpu.gpudata, 1) + + def _estimateMemoryOccupation(self, N, M, D): + """ + Estimate the best batch size. + N - the number of total datapoints + M - the number of inducing points + D - the number of observed (output) dimensions + return: the constant memory size, the memory occupation of batchsize=1 + unit: GB + """ + return (M+9.*M*M+3*M*D+N+2.*N*D)*8./1024./1024./1024., (4.+3.*M+D+3.*M*M)*8./1024./1024./1024. + + def _estimateBatchSize(self, kern, N, M, Q, D): + """ + Estimate the best batch size. + N - the number of total datapoints + M - the number of inducing points + D - the number of observed (output) dimensions + return: the constant memory size, the memory occupation of batchsize=1 + unit: GB + """ + if kern.useGPU: + x0,x1 = kern.psicomp.estimateMemoryOccupation(N,M,Q) + else: + x0, x1 = 0.,0. + y0, y1 = self._estimateMemoryOccupation(N, M, D) + + return int((self.gpu_memory-y0-x0)/(x1+y1)) def _get_YYTfactor(self, Y): """ @@ -104,10 +131,10 @@ class VarDTC_GPU(object): Cached intermediate results: Kmm, KmmInv, """ - num_inducing = Z.shape[0] + num_inducing, input_dim = Z.shape[0], Z.shape[1] num_data, output_dim = Y.shape - self._initGPUCache(num_inducing, output_dim, Y) + self._initGPUCache(kern, num_inducing, input_dim, output_dim, Y) if isinstance(X, VariationalPosterior): uncertain_inputs = True @@ -238,7 +265,7 @@ class VarDTC_GPU(object): Kmm = kern.K(Z).copy() Kmm_gpu = self.gpuCache['Kmm_gpu'] - Kmm_gpu.set(Kmm) + Kmm_gpu.set(np.asfortranarray(Kmm)) diag.add(Kmm, self.const_jitter) ones_gpu = self.gpuCache['ones_gpu'] cublas.cublasDaxpy(self.cublas_handle, num_inducing, self.const_jitter, ones_gpu.gpudata, 1, Kmm_gpu.gpudata, num_inducing+1) @@ -310,9 +337,7 @@ class VarDTC_GPU(object): cublas.cublasDaxpy(self.cublas_handle, KmmInvPsi2P_gpu.size, np.float64(-output_dim), KmmInvPsi2P_gpu.gpudata, 1, dL_dpsi2R_gpu.gpudata, 1) cublas.cublasDscal(self.cublas_handle, dL_dpsi2R_gpu.size, np.float64(-0.5), dL_dpsi2R_gpu.gpudata, 1) # print np.abs(dL_dpsi2R_gpu.get()-dL_dpsi2R).max() - - #logDiagSum = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="i%step==0?log(x[i]):0", arguments="double *x, int step") - + #====================================================================== # Compute log-likelihood #====================================================================== @@ -489,22 +514,6 @@ class VarDTC_GPU(object): mul_bcast(betaYT2_gpu,betaYT2_gpu,betaYT_gpu_slice,betaYT2_gpu.size) sum_axis(dL_dthetaL_gpu, betaYT2_gpu, 1, output_dim) -# if het_noise: -# if uncertain_inputs: -# psiR = np.einsum('mo,nmo->n',dL_dpsi2R,psi2) -# else: -# psiR = 
np.einsum('nm,no,mo->n',psi1,psi1,dL_dpsi2R) -# -# dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0)-output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum(axis=-1) -# else: -# if uncertain_inputs: -# psiR = np.einsum('mo,nmo->',dL_dpsi2R,psi2) -# else: -# psiR = np.einsum('nm,no,mo->',psi1,psi1,dL_dpsi2R) -# -# dL_dthetaL = ((np.square(betaY)).sum() + np.square(beta)*output_dim*(psi0.sum())-num_slice*output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum() - - if kern.useGPU: dL_dpsi0 = dL_dpsi0_gpu dL_dpsi1 = dL_dpsi1_gpu diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py index a695d14e..8d2f24bc 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py @@ -8,11 +8,12 @@ The package for the psi statistics computation on GPU import numpy as np from GPy.util.caching import Cache_this +from ....util import gpu_init +assert gpu_init.initSuccess + try: - import scikits.cuda.linalg as culinalg import pycuda.gpuarray as gpuarray from scikits.cuda import cublas - import pycuda.autoinit from pycuda.reduction import ReductionKernel from pycuda.elementwise import ElementwiseKernel from ....util import linalg_gpu @@ -256,7 +257,7 @@ except: class PSICOMP_SSRBF(object): def __init__(self): - self.cublas_handle = cublas.cublasCreate() + self.cublas_handle = gpu_init.cublas_handle self.gpuCache = None self.gpuCacheAll = None @@ -327,6 +328,17 @@ class PSICOMP_SSRBF(object): [v.gpudata.free() for v in self.gpuCacheAll.values()] self.gpuCacheAll = None self.gpuCache = None + + def estimateMemoryOccupation(self, N, M, Q): + """ + Estimate the best batch size. + N - the number of total datapoints + M - the number of inducing points + Q - the number of hidden (input) dimensions + return: the constant memory size, the memory occupation of batchsize=1 + unit: GB + """ + return (2.*Q+2.*M*Q+M*M*Q)*8./1024./1024./1024., (1.+2.*M+10.*Q+2.*M*M+8.*M*Q+7.*M*M*Q)*8./1024./1024./1024. 
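    # A rough worked example of this estimate (illustrative numbers only, not
    # from the code): with M=50 inducing points and Q=10 input dimensions,
    #     const     = (2*10 + 2*50*10 + 50*50*10)*8/1024.**3               ~ 1.9e-4 GB
    #     per_point = (1 + 2*50 + 10*10 + 2*50**2 + 8*50*10 + 7*50**2*10)*8/1024.**3
    #               ~ 1.4e-3 GB
    # so VarDTC_GPU._estimateBatchSize, which divides the available gpu_memory
    # (4 GB by default) by the summed per-point costs, would allow a batch of
    # roughly 4./1.4e-3 ~ 2900 points before the inference-side buffers from
    # its own _estimateMemoryOccupation are accounted for.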
@Cache_this(limit=1,ignore_args=(0,)) def psicomputations(self, variance, lengthscale, Z, mu, S, gamma): diff --git a/GPy/util/gpu_init.py b/GPy/util/gpu_init.py new file mode 100644 index 00000000..917d8158 --- /dev/null +++ b/GPy/util/gpu_init.py @@ -0,0 +1,16 @@ +""" +The package for scikits.cuda initialization + +Global variables: initSuccess +providing CUBLAS handle: cublas_handle +""" + +try: + import pycuda.autoinit + from scikits.cuda import cublas + import scikits.cuda.linalg as culinalg + culinalg.init() + cublas_handle = cublas.cublasCreate() + initSuccess = True +except: + initSuccess = False \ No newline at end of file diff --git a/GPy/util/linalg_gpu.py b/GPy/util/linalg_gpu.py index 6062d135..6ec4fb48 100644 --- a/GPy/util/linalg_gpu.py +++ b/GPy/util/linalg_gpu.py @@ -7,8 +7,10 @@ # import numpy as np +from ..util import gpu_init +assert gpu_init.initSuccess + try: - import pycuda.autoinit from pycuda.reduction import ReductionKernel from pycuda.elementwise import ElementwiseKernel From 5cfc250ad140dbeec9941c839439c844d0b6f219 Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Tue, 8 Apr 2014 15:26:34 +0100 Subject: [PATCH 32/33] [SSGPLVM] add plotting class --- .../latent_function_inference/var_dtc_gpu.py | 2 +- GPy/kern/_src/kern.py | 2 +- GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py | 2 +- GPy/kern/_src/rbf.py | 3 - GPy/models/ss_gplvm.py | 9 ++- GPy/plotting/matplot_dep/__init__.py | 2 + GPy/plotting/matplot_dep/img_plots.py | 56 +++++++++++++++++++ GPy/plotting/matplot_dep/ssgplvm.py | 29 ++++++++++ GPy/util/linalg_gpu.py | 1 - 9 files changed, 96 insertions(+), 10 deletions(-) create mode 100644 GPy/plotting/matplot_dep/img_plots.py create mode 100644 GPy/plotting/matplot_dep/ssgplvm.py diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py index a3fe0782..1089fc6c 100644 --- a/GPy/inference/latent_function_inference/var_dtc_gpu.py +++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py @@ -10,9 +10,9 @@ from ...util.misc import param_to_array log_2_pi = np.log(2*np.pi) from ...util import gpu_init -assert gpu_init.initSuccess try: + import scikits.cuda.linalg as culinalg import pycuda.gpuarray as gpuarray from scikits.cuda import cublas from ...util.linalg_gpu import logDiagSum, strideSum, mul_bcast, sum_axis, outer_prod, mul_bcast_first, join_prod diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index dbe4c1f8..f871e676 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -13,7 +13,7 @@ class Kern(Parameterized): #=========================================================================== # This adds input slice support. 
The rather ugly code for slicing can be # found in kernel_slice_operations - __metaclass__ = KernCallsViaSlicerMeta + #__metaclass__ = KernCallsViaSlicerMeta #=========================================================================== _support_GPU=False def __init__(self, input_dim, active_dims, name, useGPU=False, *a, **kw): diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py index 8d2f24bc..f49dc52a 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py @@ -9,7 +9,6 @@ import numpy as np from GPy.util.caching import Cache_this from ....util import gpu_init -assert gpu_init.initSuccess try: import pycuda.gpuarray as gpuarray @@ -257,6 +256,7 @@ except: class PSICOMP_SSRBF(object): def __init__(self): + assert gpu_init.initSuccess, "GPU initialization failed!" self.cublas_handle = gpu_init.cublas_handle self.gpuCache = None self.gpuCacheAll = None diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index e08d94f9..e0071fb9 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -11,9 +11,6 @@ from ...core.parameterization import variational from psi_comp import ssrbf_psi_comp from psi_comp.ssrbf_psi_gpucomp import PSICOMP_SSRBF -import pycuda.gpuarray as gpuarray -import pycuda.autoinit - class RBF(Stationary): """ Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel: diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py index 55ee573c..57be302a 100644 --- a/GPy/models/ss_gplvm.py +++ b/GPy/models/ss_gplvm.py @@ -30,9 +30,12 @@ class SSGPLVM(SparseGP): def __init__(self, Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10, Z=None, kernel=None, inference_method=None, likelihood=None, name='Spike-and-Slab GPLVM', group_spike=False, **kwargs): - if X == None: # The mean of variational approximation (mu) + if X == None: from ..util.initialization import initialize_latent - X = initialize_latent(init, input_dim, Y) + X, fracs = initialize_latent(init, input_dim, Y) + else: + fracs = np.ones(input_dim) + self.init = init if X_variance is None: # The variance of the variational approximation (S) @@ -52,7 +55,7 @@ class SSGPLVM(SparseGP): likelihood = Gaussian() if kernel is None: - kernel = kern.SSRBF(input_dim) + kernel = kern.RBF(input_dim, lengthscale=fracs, ARD=True) # + kern.white(input_dim) pi = np.empty((input_dim)) pi[:] = 0.5 diff --git a/GPy/plotting/matplot_dep/__init__.py b/GPy/plotting/matplot_dep/__init__.py index e2706903..f493513a 100644 --- a/GPy/plotting/matplot_dep/__init__.py +++ b/GPy/plotting/matplot_dep/__init__.py @@ -15,3 +15,5 @@ import latent_space_visualizations import netpbmfile import inference_plots import maps +import img_plots +from ssgplvm import SSGPLVM_plot diff --git a/GPy/plotting/matplot_dep/img_plots.py b/GPy/plotting/matplot_dep/img_plots.py new file mode 100644 index 00000000..fbaaa237 --- /dev/null +++ b/GPy/plotting/matplot_dep/img_plots.py @@ -0,0 +1,56 @@ +""" +The module contains the tools for ploting 2D image visualizations +""" + +import numpy as np +from matplotlib.cm import jet + +width_max = 15 +height_max = 12 + +def _calculateFigureSize(x_size, y_size, fig_ncols, fig_nrows, pad): + width = (x_size*fig_ncols+pad*(fig_ncols-1)) + height = (y_size*fig_nrows+pad*(fig_nrows-1)) + if width > float(height)/height_max*width_max: + return (width_max, float(width_max)/width*height) + else: + return (float(height_max)/height*width, height_max) + +def plot_2D_images(figure, 
arr, symmetric=False, pad=None, zoom=None, mode=None, interpolation='nearest'): + ax = figure.add_subplot(111) + if len(arr.shape)==2: + arr = arr.reshape(*((1,)+arr.shape)) + fig_num = arr.shape[0] + y_size = arr.shape[1] + x_size = arr.shape[2] + fig_ncols = int(np.ceil(np.sqrt(fig_num))) + fig_nrows = int(np.ceil((float)(fig_num)/fig_ncols)) + if pad==None: + pad = max(int(min(y_size,x_size)/10),1) + + figsize = _calculateFigureSize(x_size, y_size, fig_ncols, fig_nrows, pad) + figure.set_size_inches(figsize,forward=True) + #figure.subplots_adjust(left=0.05, bottom=0.05, right=0.95, top=0.95) + + if symmetric: + # symmetric around zero: fix zero as the middle color + mval = max(abs(arr.max()),abs(arr.min())) + arr = arr/(2.*mval)+0.5 + else: + minval,maxval = arr.max(),arr.min() + arr = (arr-minval)/(maxval-minval) + + if mode=='L': + arr_color = np.empty(arr.shape+(3,)) + arr_color[:] = arr.reshape(*(arr.shape+(1,))) + elif mode==None or mode=='jet': + arr_color = jet(arr) + + buf = np.ones((y_size*fig_nrows+pad*(fig_nrows-1), x_size*fig_ncols+pad*(fig_ncols-1), 3),dtype=arr.dtype) + + for y in xrange(fig_nrows): + for x in xrange(fig_ncols): + if y*fig_ncols+x Date: Wed, 9 Apr 2014 12:22:46 +0100 Subject: [PATCH 33/33] fix stick man example --- GPy/core/gp.py | 2 +- GPy/examples/dimensionality_reduction.py | 6 +++--- .../exact_gaussian_inference.py | 2 +- GPy/inference/latent_function_inference/var_dtc_gpu.py | 4 +++- GPy/plotting/matplot_dep/img_plots.py | 4 ++-- GPy/plotting/matplot_dep/visualize.py | 9 +++++---- 6 files changed, 15 insertions(+), 12 deletions(-) diff --git a/GPy/core/gp.py b/GPy/core/gp.py index 490bcc72..692e5d01 100644 --- a/GPy/core/gp.py +++ b/GPy/core/gp.py @@ -121,7 +121,7 @@ class GP(Model): If full_cov and self.input_dim > 1, the return shape of var is Nnew x Nnew x self.input_dim. If self.input_dim == 1, the return shape is Nnew x Nnew. This is to allow for different normalizations of the output dimensions. - """ + """ #predict the latent function values mu, var = self._raw_predict(Xnew, full_cov=full_cov) diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index 07623d6b..c1911e75 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -409,12 +409,12 @@ def stick(kernel=None, optimize=True, verbose=True, plot=True): # optimize m = GPy.models.GPLVM(data['Y'], 2, kernel=kernel) if optimize: m.optimize(messages=verbose, max_f_eval=10000) - if plot and GPy.plotting.matplot_dep.visualize.visual_available: + if plot: plt.clf ax = m.plot_latent() - y = m.likelihood.Y[0, :] + y = m.Y[0, :] data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect']) - GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) + vis = GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, latent_axes=ax) raw_input('Press enter to finish') return m diff --git a/GPy/inference/latent_function_inference/exact_gaussian_inference.py b/GPy/inference/latent_function_inference/exact_gaussian_inference.py index 074b67a6..c0177e9f 100644 --- a/GPy/inference/latent_function_inference/exact_gaussian_inference.py +++ b/GPy/inference/latent_function_inference/exact_gaussian_inference.py @@ -32,7 +32,7 @@ class ExactGaussianInference(object): return Y else: #if Y in self.cache, return self.Cache[Y], else store Y in cache and return L. - print "WARNING: N>D of Y, we need caching of L, such that L*L^T = Y, returning Y still!" 
+ #print "WARNING: N>D of Y, we need caching of L, such that L*L^T = Y, returning Y still!" return Y def inference(self, kern, X, likelihood, Y, Y_metadata=None): diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py index 1089fc6c..9b2da1c9 100644 --- a/GPy/inference/latent_function_inference/var_dtc_gpu.py +++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py @@ -109,7 +109,9 @@ class VarDTC_GPU(object): x0, x1 = 0.,0. y0, y1 = self._estimateMemoryOccupation(N, M, D) - return int((self.gpu_memory-y0-x0)/(x1+y1)) + opt_batchsize = min(int((self.gpu_memory-y0-x0)/(x1+y1)), N) + + return opt_batchsize def _get_YYTfactor(self, Y): """ diff --git a/GPy/plotting/matplot_dep/img_plots.py b/GPy/plotting/matplot_dep/img_plots.py index fbaaa237..21dbd64f 100644 --- a/GPy/plotting/matplot_dep/img_plots.py +++ b/GPy/plotting/matplot_dep/img_plots.py @@ -29,7 +29,7 @@ def plot_2D_images(figure, arr, symmetric=False, pad=None, zoom=None, mode=None, pad = max(int(min(y_size,x_size)/10),1) figsize = _calculateFigureSize(x_size, y_size, fig_ncols, fig_nrows, pad) - figure.set_size_inches(figsize,forward=True) + #figure.set_size_inches(figsize,forward=True) #figure.subplots_adjust(left=0.05, bottom=0.05, right=0.95, top=0.95) if symmetric: @@ -37,7 +37,7 @@ def plot_2D_images(figure, arr, symmetric=False, pad=None, zoom=None, mode=None, mval = max(abs(arr.max()),abs(arr.min())) arr = arr/(2.*mval)+0.5 else: - minval,maxval = arr.max(),arr.min() + minval,maxval = arr.min(),arr.max() arr = (arr-minval)/(maxval-minval) if mode=='L': diff --git a/GPy/plotting/matplot_dep/visualize.py b/GPy/plotting/matplot_dep/visualize.py index f8bcc9f9..cf457633 100644 --- a/GPy/plotting/matplot_dep/visualize.py +++ b/GPy/plotting/matplot_dep/visualize.py @@ -85,6 +85,7 @@ class vector_show(matplotlib_show): class lvm(matplotlib_show): + def __init__(self, vals, model, data_visualize, latent_axes=None, sense_axes=None, latent_index=[0,1]): """Visualize a latent variable model @@ -98,7 +99,7 @@ class lvm(matplotlib_show): vals = param_to_array(model.X.mean) else: vals = param_to_array(model.X) - + vals = param_to_array(vals) matplotlib_show.__init__(self, vals, axes=latent_axes) @@ -121,7 +122,7 @@ class lvm(matplotlib_show): self.move_on = False self.latent_index = latent_index self.latent_dim = model.input_dim - + # The red cross which shows current latent point. self.latent_values = vals self.latent_handle = self.latent_axes.plot([0],[0],'rx',mew=2)[0] @@ -130,10 +131,10 @@ class lvm(matplotlib_show): def modify(self, vals): """When latent values are modified update the latent representation and ulso update the output visualization.""" - self.vals = vals.copy() + self.vals = vals[None,:].copy() y = self.model.predict(self.vals)[0] self.data_visualize.modify(y) - self.latent_handle.set_data(self.vals[self.latent_index[0]], self.vals[self.latent_index[1]]) + self.latent_handle.set_data(self.vals[:,self.latent_index[0]], self.vals[:,self.latent_index[1]]) self.axes.figure.canvas.draw()