[GPU] add automatic batchsize estimation

This commit is contained in:
Zhenwen Dai 2014-04-07 11:55:46 +01:00
parent bbcba2553c
commit 01860455af
4 changed files with 75 additions and 36 deletions

View file

@ -9,12 +9,12 @@ import numpy as np
from ...util.misc import param_to_array from ...util.misc import param_to_array
log_2_pi = np.log(2*np.pi) log_2_pi = np.log(2*np.pi)
from ...util import gpu_init
assert gpu_init.initSuccess
try: try:
import scikits.cuda.linalg as culinalg
import pycuda.gpuarray as gpuarray import pycuda.gpuarray as gpuarray
from scikits.cuda import cublas from scikits.cuda import cublas
import pycuda.autoinit
from pycuda.reduction import ReductionKernel
from ...util.linalg_gpu import logDiagSum, strideSum, mul_bcast, sum_axis, outer_prod, mul_bcast_first, join_prod from ...util.linalg_gpu import logDiagSum, strideSum, mul_bcast, sum_axis, outer_prod, mul_bcast_first, join_prod
except: except:
pass pass
@ -30,25 +30,24 @@ class VarDTC_GPU(object):
""" """
const_jitter = np.float64(1e-6) const_jitter = np.float64(1e-6)
def __init__(self, batchsize=None, limit=1): def __init__(self, batchsize=None, gpu_memory=4., limit=1):
self.batchsize = batchsize self.batchsize = batchsize
self.gpu_memory = gpu_memory
self.midRes = {} self.midRes = {}
self.batch_pos = 0 # the starting position of the current mini-batch self.batch_pos = 0 # the starting position of the current mini-batch
# Initialize GPU environment self.cublas_handle = gpu_init.cublas_handle
culinalg.init()
self.cublas_handle = cublas.cublasCreate()
# Initialize GPU caches # Initialize GPU caches
self.gpuCache = None self.gpuCache = None
def _initGPUCache(self, num_inducing, output_dim, Y): def _initGPUCache(self, kern, num_inducing, input_dim, output_dim, Y):
ndata = Y.shape[0]
if self.batchsize==None: if self.batchsize==None:
self.batchsize = Y.shape[0] self.batchsize = self._estimateBatchSize(kern, ndata, num_inducing, input_dim, output_dim)
if self.gpuCache == None: if self.gpuCache == None:
ndata = Y.shape[0]
self.gpuCache = {# inference_likelihood self.gpuCache = {# inference_likelihood
'Kmm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), 'Kmm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'),
'Lm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), 'Lm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'),
@ -83,6 +82,34 @@ class VarDTC_GPU(object):
YT_gpu = self.gpuCache['YT_gpu'] YT_gpu = self.gpuCache['YT_gpu']
self._trYYT = cublas.cublasDdot(self.cublas_handle, YT_gpu.size, YT_gpu.gpudata, 1, YT_gpu.gpudata, 1) self._trYYT = cublas.cublasDdot(self.cublas_handle, YT_gpu.size, YT_gpu.gpudata, 1, YT_gpu.gpudata, 1)
def _estimateMemoryOccupation(self, N, M, D):
"""
Estimate the best batch size.
N - the number of total datapoints
M - the number of inducing points
D - the number of observed (output) dimensions
return: the constant memory size, the memory occupation of batchsize=1
unit: GB
"""
return (M+9.*M*M+3*M*D+N+2.*N*D)*8./1024./1024./1024., (4.+3.*M+D+3.*M*M)*8./1024./1024./1024.
def _estimateBatchSize(self, kern, N, M, Q, D):
"""
Estimate the best batch size.
N - the number of total datapoints
M - the number of inducing points
D - the number of observed (output) dimensions
return: the constant memory size, the memory occupation of batchsize=1
unit: GB
"""
if kern.useGPU:
x0,x1 = kern.psicomp.estimateMemoryOccupation(N,M,Q)
else:
x0, x1 = 0.,0.
y0, y1 = self._estimateMemoryOccupation(N, M, D)
return int((self.gpu_memory-y0-x0)/(x1+y1))
def _get_YYTfactor(self, Y): def _get_YYTfactor(self, Y):
""" """
@ -104,10 +131,10 @@ class VarDTC_GPU(object):
Cached intermediate results: Kmm, KmmInv, Cached intermediate results: Kmm, KmmInv,
""" """
num_inducing = Z.shape[0] num_inducing, input_dim = Z.shape[0], Z.shape[1]
num_data, output_dim = Y.shape num_data, output_dim = Y.shape
self._initGPUCache(num_inducing, output_dim, Y) self._initGPUCache(kern, num_inducing, input_dim, output_dim, Y)
if isinstance(X, VariationalPosterior): if isinstance(X, VariationalPosterior):
uncertain_inputs = True uncertain_inputs = True
@ -238,7 +265,7 @@ class VarDTC_GPU(object):
Kmm = kern.K(Z).copy() Kmm = kern.K(Z).copy()
Kmm_gpu = self.gpuCache['Kmm_gpu'] Kmm_gpu = self.gpuCache['Kmm_gpu']
Kmm_gpu.set(Kmm) Kmm_gpu.set(np.asfortranarray(Kmm))
diag.add(Kmm, self.const_jitter) diag.add(Kmm, self.const_jitter)
ones_gpu = self.gpuCache['ones_gpu'] ones_gpu = self.gpuCache['ones_gpu']
cublas.cublasDaxpy(self.cublas_handle, num_inducing, self.const_jitter, ones_gpu.gpudata, 1, Kmm_gpu.gpudata, num_inducing+1) cublas.cublasDaxpy(self.cublas_handle, num_inducing, self.const_jitter, ones_gpu.gpudata, 1, Kmm_gpu.gpudata, num_inducing+1)
@ -310,9 +337,7 @@ class VarDTC_GPU(object):
cublas.cublasDaxpy(self.cublas_handle, KmmInvPsi2P_gpu.size, np.float64(-output_dim), KmmInvPsi2P_gpu.gpudata, 1, dL_dpsi2R_gpu.gpudata, 1) cublas.cublasDaxpy(self.cublas_handle, KmmInvPsi2P_gpu.size, np.float64(-output_dim), KmmInvPsi2P_gpu.gpudata, 1, dL_dpsi2R_gpu.gpudata, 1)
cublas.cublasDscal(self.cublas_handle, dL_dpsi2R_gpu.size, np.float64(-0.5), dL_dpsi2R_gpu.gpudata, 1) cublas.cublasDscal(self.cublas_handle, dL_dpsi2R_gpu.size, np.float64(-0.5), dL_dpsi2R_gpu.gpudata, 1)
# print np.abs(dL_dpsi2R_gpu.get()-dL_dpsi2R).max() # print np.abs(dL_dpsi2R_gpu.get()-dL_dpsi2R).max()
#logDiagSum = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="i%step==0?log(x[i]):0", arguments="double *x, int step")
#====================================================================== #======================================================================
# Compute log-likelihood # Compute log-likelihood
#====================================================================== #======================================================================
@ -489,22 +514,6 @@ class VarDTC_GPU(object):
mul_bcast(betaYT2_gpu,betaYT2_gpu,betaYT_gpu_slice,betaYT2_gpu.size) mul_bcast(betaYT2_gpu,betaYT2_gpu,betaYT_gpu_slice,betaYT2_gpu.size)
sum_axis(dL_dthetaL_gpu, betaYT2_gpu, 1, output_dim) sum_axis(dL_dthetaL_gpu, betaYT2_gpu, 1, output_dim)
# if het_noise:
# if uncertain_inputs:
# psiR = np.einsum('mo,nmo->n',dL_dpsi2R,psi2)
# else:
# psiR = np.einsum('nm,no,mo->n',psi1,psi1,dL_dpsi2R)
#
# dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0)-output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum(axis=-1)
# else:
# if uncertain_inputs:
# psiR = np.einsum('mo,nmo->',dL_dpsi2R,psi2)
# else:
# psiR = np.einsum('nm,no,mo->',psi1,psi1,dL_dpsi2R)
#
# dL_dthetaL = ((np.square(betaY)).sum() + np.square(beta)*output_dim*(psi0.sum())-num_slice*output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum()
if kern.useGPU: if kern.useGPU:
dL_dpsi0 = dL_dpsi0_gpu dL_dpsi0 = dL_dpsi0_gpu
dL_dpsi1 = dL_dpsi1_gpu dL_dpsi1 = dL_dpsi1_gpu

View file

@ -8,11 +8,12 @@ The package for the psi statistics computation on GPU
import numpy as np import numpy as np
from GPy.util.caching import Cache_this from GPy.util.caching import Cache_this
from ....util import gpu_init
assert gpu_init.initSuccess
try: try:
import scikits.cuda.linalg as culinalg
import pycuda.gpuarray as gpuarray import pycuda.gpuarray as gpuarray
from scikits.cuda import cublas from scikits.cuda import cublas
import pycuda.autoinit
from pycuda.reduction import ReductionKernel from pycuda.reduction import ReductionKernel
from pycuda.elementwise import ElementwiseKernel from pycuda.elementwise import ElementwiseKernel
from ....util import linalg_gpu from ....util import linalg_gpu
@ -256,7 +257,7 @@ except:
class PSICOMP_SSRBF(object): class PSICOMP_SSRBF(object):
def __init__(self): def __init__(self):
self.cublas_handle = cublas.cublasCreate() self.cublas_handle = gpu_init.cublas_handle
self.gpuCache = None self.gpuCache = None
self.gpuCacheAll = None self.gpuCacheAll = None
@ -327,6 +328,17 @@ class PSICOMP_SSRBF(object):
[v.gpudata.free() for v in self.gpuCacheAll.values()] [v.gpudata.free() for v in self.gpuCacheAll.values()]
self.gpuCacheAll = None self.gpuCacheAll = None
self.gpuCache = None self.gpuCache = None
def estimateMemoryOccupation(self, N, M, Q):
"""
Estimate the best batch size.
N - the number of total datapoints
M - the number of inducing points
Q - the number of hidden (input) dimensions
return: the constant memory size, the memory occupation of batchsize=1
unit: GB
"""
return (2.*Q+2.*M*Q+M*M*Q)*8./1024./1024./1024., (1.+2.*M+10.*Q+2.*M*M+8.*M*Q+7.*M*M*Q)*8./1024./1024./1024.
@Cache_this(limit=1,ignore_args=(0,)) @Cache_this(limit=1,ignore_args=(0,))
def psicomputations(self, variance, lengthscale, Z, mu, S, gamma): def psicomputations(self, variance, lengthscale, Z, mu, S, gamma):

16
GPy/util/gpu_init.py Normal file
View file

@ -0,0 +1,16 @@
"""
The package for scikits.cuda initialization
Global variables: initSuccess
providing CUBLAS handle: cublas_handle
"""
try:
import pycuda.autoinit
from scikits.cuda import cublas
import scikits.cuda.linalg as culinalg
culinalg.init()
cublas_handle = cublas.cublasCreate()
initSuccess = True
except:
initSuccess = False

View file

@ -7,8 +7,10 @@
# #
import numpy as np import numpy as np
from ..util import gpu_init
assert gpu_init.initSuccess
try: try:
import pycuda.autoinit
from pycuda.reduction import ReductionKernel from pycuda.reduction import ReductionKernel
from pycuda.elementwise import ElementwiseKernel from pycuda.elementwise import ElementwiseKernel