From 01860455afa0d775a1b8e79039232d5be407e3e9 Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Mon, 7 Apr 2014 11:55:46 +0100 Subject: [PATCH] [GPU] add automatic batchsize estimation --- .../latent_function_inference/var_dtc_gpu.py | 73 +++++++++++-------- GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py | 18 ++++- GPy/util/gpu_init.py | 16 ++++ GPy/util/linalg_gpu.py | 4 +- 4 files changed, 75 insertions(+), 36 deletions(-) create mode 100644 GPy/util/gpu_init.py diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py index 793d9bf7..a3fe0782 100644 --- a/GPy/inference/latent_function_inference/var_dtc_gpu.py +++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py @@ -9,12 +9,12 @@ import numpy as np from ...util.misc import param_to_array log_2_pi = np.log(2*np.pi) +from ...util import gpu_init +assert gpu_init.initSuccess + try: - import scikits.cuda.linalg as culinalg import pycuda.gpuarray as gpuarray from scikits.cuda import cublas - import pycuda.autoinit - from pycuda.reduction import ReductionKernel from ...util.linalg_gpu import logDiagSum, strideSum, mul_bcast, sum_axis, outer_prod, mul_bcast_first, join_prod except: pass @@ -30,25 +30,24 @@ class VarDTC_GPU(object): """ const_jitter = np.float64(1e-6) - def __init__(self, batchsize=None, limit=1): + def __init__(self, batchsize=None, gpu_memory=4., limit=1): self.batchsize = batchsize + self.gpu_memory = gpu_memory self.midRes = {} self.batch_pos = 0 # the starting position of the current mini-batch - # Initialize GPU environment - culinalg.init() - self.cublas_handle = cublas.cublasCreate() + self.cublas_handle = gpu_init.cublas_handle # Initialize GPU caches self.gpuCache = None - def _initGPUCache(self, num_inducing, output_dim, Y): + def _initGPUCache(self, kern, num_inducing, input_dim, output_dim, Y): + ndata = Y.shape[0] if self.batchsize==None: - self.batchsize = Y.shape[0] + self.batchsize = self._estimateBatchSize(kern, ndata, num_inducing, input_dim, output_dim) if self.gpuCache == None: - ndata = Y.shape[0] self.gpuCache = {# inference_likelihood 'Kmm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), 'Lm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), @@ -83,6 +82,34 @@ class VarDTC_GPU(object): YT_gpu = self.gpuCache['YT_gpu'] self._trYYT = cublas.cublasDdot(self.cublas_handle, YT_gpu.size, YT_gpu.gpudata, 1, YT_gpu.gpudata, 1) + + def _estimateMemoryOccupation(self, N, M, D): + """ + Estimate the best batch size. + N - the number of total datapoints + M - the number of inducing points + D - the number of observed (output) dimensions + return: the constant memory size, the memory occupation of batchsize=1 + unit: GB + """ + return (M+9.*M*M+3*M*D+N+2.*N*D)*8./1024./1024./1024., (4.+3.*M+D+3.*M*M)*8./1024./1024./1024. + + def _estimateBatchSize(self, kern, N, M, Q, D): + """ + Estimate the best batch size. + N - the number of total datapoints + M - the number of inducing points + D - the number of observed (output) dimensions + return: the constant memory size, the memory occupation of batchsize=1 + unit: GB + """ + if kern.useGPU: + x0,x1 = kern.psicomp.estimateMemoryOccupation(N,M,Q) + else: + x0, x1 = 0.,0. + y0, y1 = self._estimateMemoryOccupation(N, M, D) + + return int((self.gpu_memory-y0-x0)/(x1+y1)) def _get_YYTfactor(self, Y): """ @@ -104,10 +131,10 @@ class VarDTC_GPU(object): Cached intermediate results: Kmm, KmmInv, """ - num_inducing = Z.shape[0] + num_inducing, input_dim = Z.shape[0], Z.shape[1] num_data, output_dim = Y.shape - self._initGPUCache(num_inducing, output_dim, Y) + self._initGPUCache(kern, num_inducing, input_dim, output_dim, Y) if isinstance(X, VariationalPosterior): uncertain_inputs = True @@ -238,7 +265,7 @@ class VarDTC_GPU(object): Kmm = kern.K(Z).copy() Kmm_gpu = self.gpuCache['Kmm_gpu'] - Kmm_gpu.set(Kmm) + Kmm_gpu.set(np.asfortranarray(Kmm)) diag.add(Kmm, self.const_jitter) ones_gpu = self.gpuCache['ones_gpu'] cublas.cublasDaxpy(self.cublas_handle, num_inducing, self.const_jitter, ones_gpu.gpudata, 1, Kmm_gpu.gpudata, num_inducing+1) @@ -310,9 +337,7 @@ class VarDTC_GPU(object): cublas.cublasDaxpy(self.cublas_handle, KmmInvPsi2P_gpu.size, np.float64(-output_dim), KmmInvPsi2P_gpu.gpudata, 1, dL_dpsi2R_gpu.gpudata, 1) cublas.cublasDscal(self.cublas_handle, dL_dpsi2R_gpu.size, np.float64(-0.5), dL_dpsi2R_gpu.gpudata, 1) # print np.abs(dL_dpsi2R_gpu.get()-dL_dpsi2R).max() - - #logDiagSum = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="i%step==0?log(x[i]):0", arguments="double *x, int step") - + #====================================================================== # Compute log-likelihood #====================================================================== @@ -489,22 +514,6 @@ class VarDTC_GPU(object): mul_bcast(betaYT2_gpu,betaYT2_gpu,betaYT_gpu_slice,betaYT2_gpu.size) sum_axis(dL_dthetaL_gpu, betaYT2_gpu, 1, output_dim) -# if het_noise: -# if uncertain_inputs: -# psiR = np.einsum('mo,nmo->n',dL_dpsi2R,psi2) -# else: -# psiR = np.einsum('nm,no,mo->n',psi1,psi1,dL_dpsi2R) -# -# dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0)-output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum(axis=-1) -# else: -# if uncertain_inputs: -# psiR = np.einsum('mo,nmo->',dL_dpsi2R,psi2) -# else: -# psiR = np.einsum('nm,no,mo->',psi1,psi1,dL_dpsi2R) -# -# dL_dthetaL = ((np.square(betaY)).sum() + np.square(beta)*output_dim*(psi0.sum())-num_slice*output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum() - - if kern.useGPU: dL_dpsi0 = dL_dpsi0_gpu dL_dpsi1 = dL_dpsi1_gpu diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py index a695d14e..8d2f24bc 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py @@ -8,11 +8,12 @@ The package for the psi statistics computation on GPU import numpy as np from GPy.util.caching import Cache_this +from ....util import gpu_init +assert gpu_init.initSuccess + try: - import scikits.cuda.linalg as culinalg import pycuda.gpuarray as gpuarray from scikits.cuda import cublas - import pycuda.autoinit from pycuda.reduction import ReductionKernel from pycuda.elementwise import ElementwiseKernel from ....util import linalg_gpu @@ -256,7 +257,7 @@ except: class PSICOMP_SSRBF(object): def __init__(self): - self.cublas_handle = cublas.cublasCreate() + self.cublas_handle = gpu_init.cublas_handle self.gpuCache = None self.gpuCacheAll = None @@ -327,6 +328,17 @@ class PSICOMP_SSRBF(object): [v.gpudata.free() for v in self.gpuCacheAll.values()] self.gpuCacheAll = None self.gpuCache = None + + def estimateMemoryOccupation(self, N, M, Q): + """ + Estimate the best batch size. + N - the number of total datapoints + M - the number of inducing points + Q - the number of hidden (input) dimensions + return: the constant memory size, the memory occupation of batchsize=1 + unit: GB + """ + return (2.*Q+2.*M*Q+M*M*Q)*8./1024./1024./1024., (1.+2.*M+10.*Q+2.*M*M+8.*M*Q+7.*M*M*Q)*8./1024./1024./1024. @Cache_this(limit=1,ignore_args=(0,)) def psicomputations(self, variance, lengthscale, Z, mu, S, gamma): diff --git a/GPy/util/gpu_init.py b/GPy/util/gpu_init.py new file mode 100644 index 00000000..917d8158 --- /dev/null +++ b/GPy/util/gpu_init.py @@ -0,0 +1,16 @@ +""" +The package for scikits.cuda initialization + +Global variables: initSuccess +providing CUBLAS handle: cublas_handle +""" + +try: + import pycuda.autoinit + from scikits.cuda import cublas + import scikits.cuda.linalg as culinalg + culinalg.init() + cublas_handle = cublas.cublasCreate() + initSuccess = True +except: + initSuccess = False \ No newline at end of file diff --git a/GPy/util/linalg_gpu.py b/GPy/util/linalg_gpu.py index 6062d135..6ec4fb48 100644 --- a/GPy/util/linalg_gpu.py +++ b/GPy/util/linalg_gpu.py @@ -7,8 +7,10 @@ # import numpy as np +from ..util import gpu_init +assert gpu_init.initSuccess + try: - import pycuda.autoinit from pycuda.reduction import ReductionKernel from pycuda.elementwise import ElementwiseKernel