[GPU] varDTC_gpu ready

This commit is contained in:
Zhenwen Dai 2014-05-09 16:12:16 +01:00
parent ddbf15d763
commit a702b0862b
4 changed files with 52 additions and 113 deletions

View file

@ -70,10 +70,6 @@ class VarDTC_GPU(object):
'dL_dpsi0_gpu' :gpuarray.empty((self.batchsize,),np.float64,order='F'), 'dL_dpsi0_gpu' :gpuarray.empty((self.batchsize,),np.float64,order='F'),
'dL_dpsi1_gpu' :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'), 'dL_dpsi1_gpu' :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'),
'dL_dpsi2_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), 'dL_dpsi2_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'),
'dL_dthetaL_gpu' :gpuarray.empty((self.batchsize,),np.float64,order='F'),
'betapsi1_gpu' :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'),
'thetaL_t_gpu' :gpuarray.empty((self.batchsize,),np.float64,order='F'),
'betaYT2_gpu' :gpuarray.empty((output_dim,self.batchsize),np.float64,order='F'),
'psi0p_gpu' :gpuarray.empty((self.batchsize,),np.float64,order='F'), 'psi0p_gpu' :gpuarray.empty((self.batchsize,),np.float64,order='F'),
'psi1p_gpu' :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'), 'psi1p_gpu' :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'),
'psi2p_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), 'psi2p_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'),
@ -377,12 +373,11 @@ class VarDTC_GPU(object):
# Compute dL_dthetaL for uncertian input and non-heter noise # Compute dL_dthetaL for uncertian input and non-heter noise
#====================================================================== #======================================================================
if uncertain_inputs and not het_noise: if not het_noise:
dL_dthetaL = (YRY_full*beta + beta*output_dim*psi0_full - num_data*output_dim*beta)/2. dL_dthetaL = (YRY_full + output_dim*psi0_full - num_data*output_dim)/-2.
dL_dthetaL += -beta*cublas.cublasDdot(self.cublas_handle,dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata,1,psi2_gpu.gpudata,1) dL_dthetaL += cublas.cublasDdot(self.cublas_handle,dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata,1,psi2_gpu.gpudata,1)
dL_dthetaL += -beta*cublas.cublasDdot(self.cublas_handle,dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata,1,psi2_gpu.gpudata,1) dL_dthetaL += cublas.cublasDdot(self.cublas_handle,v_gpu.size, v_gpu.gpudata,1,psi1Y_gpu.gpudata,1)
dL_dthetaL += traceDot(v_gpu, psi1Y_gpu, num_inducing, output_dim) self.midRes['dL_dthetaL'] = -beta*dL_dthetaL
self.midRes['dL_dthetaL'] = dL_dthetaL
return logL, dL_dKmm_gpu.get(), post return logL, dL_dKmm_gpu.get(), post
@ -419,22 +414,22 @@ class VarDTC_GPU(object):
beta = beta[n_start] # nSlice==1 beta = beta[n_start] # nSlice==1
if kern.useGPU: if kern.useGPU:
if uncertain_inputs: if not uncertain_inputs:
psi0p_gpu = kern.psi0(Z, X_slice)
psi1p_gpu = kern.psi1(Z, X_slice)
psi2p_gpu = kern.psi2(Z, X_slice)
else:
psi0p_gpu = kern.Kdiag(X_slice) psi0p_gpu = kern.Kdiag(X_slice)
psi1p_gpu = kern.K(X_slice, Z) psi1p_gpu = kern.K(X_slice, Z)
psi2p_gpu = self.gpuCache['psi2p_gpu'] psi2p_gpu = self.gpuCache['psi2p_gpu']
else: elif het_noise:
if uncertain_inputs: psi0p_gpu = kern.psi0(Z, X_slice)
psi1p_gpu = kern.psi1(Z, X_slice)
psi2p_gpu = kern.psi2(Z, X_slice)
elif not uncertain_inputs or het_noise:
if not uncertain_inputs:
psi0 = kern.Kdiag(X_slice)
psi1 = kern.K(X_slice, Z)
elif het_noise:
psi0 = kern.psi0(Z, X_slice) psi0 = kern.psi0(Z, X_slice)
psi1 = kern.psi1(Z, X_slice) psi1 = kern.psi1(Z, X_slice)
psi2 = kern.psi2(Z, X_slice) psi2 = kern.psi2(Z, X_slice)
else:
psi0 = kern.Kdiag(X_slice)
psi1 = kern.K(X_slice, Z)
psi0p_gpu = self.gpuCache['psi0p_gpu'] psi0p_gpu = self.gpuCache['psi0p_gpu']
psi1p_gpu = self.gpuCache['psi1p_gpu'] psi1p_gpu = self.gpuCache['psi1p_gpu']
@ -448,41 +443,21 @@ class VarDTC_GPU(object):
psi2p_gpu.set(np.asfortranarray(psi2)) psi2p_gpu.set(np.asfortranarray(psi2))
#====================================================================== #======================================================================
# Prepare gpu memory # Compute dL_dpsi
#====================================================================== #======================================================================
dL_dpsi2R_gpu = self.gpuCache['dL_dpsi2R_gpu'] dL_dpsi2R_gpu = self.gpuCache['dL_dpsi2R_gpu']
v_gpu = self.gpuCache['v_gpu'] v_gpu = self.gpuCache['v_gpu']
betaYT_gpu = self.gpuCache['betaYT_gpu']
beta_gpu = self.gpuCache['beta_gpu']
dL_dpsi0_gpu = self.gpuCache['dL_dpsi0_gpu'] dL_dpsi0_gpu = self.gpuCache['dL_dpsi0_gpu']
dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu'] dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu']
dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu'] dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu']
dL_dthetaL_gpu = self.gpuCache['dL_dthetaL_gpu'] betaYT_gpu = self.gpuCache['betaYT_gpu']
psi2R_gpu = self.gpuCache['psi2_t_gpu'][:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing)
betapsi1_gpu = self.gpuCache['betapsi1_gpu']
thetaL_t_gpu = self.gpuCache['thetaL_t_gpu']
betaYT2_gpu = self.gpuCache['betaYT2_gpu']
betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end] betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end]
beta_gpu_slice = beta_gpu[n_start:n_end]
# Adjust to the batch size # Adjust to the batch size
if dL_dpsi0_gpu.shape[0] > nSlice: if dL_dpsi0_gpu.shape[0] > nSlice:
betaYT2_gpu = betaYT2_gpu[:,:nSlice]
dL_dpsi0_gpu = dL_dpsi0_gpu.ravel()[:nSlice] dL_dpsi0_gpu = dL_dpsi0_gpu.ravel()[:nSlice]
dL_dpsi1_gpu = dL_dpsi1_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing) dL_dpsi1_gpu = dL_dpsi1_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing)
dL_dpsi2_gpu = dL_dpsi2_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing)
dL_dthetaL_gpu = dL_dthetaL_gpu.ravel()[:nSlice]
psi2R_gpu = psi2R_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing)
thetaL_t_gpu = thetaL_t_gpu.ravel()[:nSlice]
betapsi1_gpu = betapsi1_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing)
mul_bcast(betapsi1_gpu,beta_gpu_slice,psi1p_gpu,beta_gpu_slice.size)
#======================================================================
# Compute dL_dpsi
#======================================================================
dL_dpsi0_gpu.fill(-output_dim *beta/2.) dL_dpsi0_gpu.fill(-output_dim *beta/2.)
@ -490,52 +465,18 @@ class VarDTC_GPU(object):
if uncertain_inputs: if uncertain_inputs:
cublas.cublasDcopy(self.cublas_handle, dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata, 1, dL_dpsi2_gpu.gpudata, 1) cublas.cublasDcopy(self.cublas_handle, dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata, 1, dL_dpsi2_gpu.gpudata, 1)
cublas.cublasDscal(self.cublas_handle, dL_dpsi2_gpu.szie, beta, dL_dpsi2_gpu.gpudata, 1) cublas.cublasDscal(self.cublas_handle, dL_dpsi2_gpu.size, beta, dL_dpsi2_gpu.gpudata, 1)
else: else:
cublas.cublasDgemm(self.cublas_handle, 'N', 'N', nSlice, num_inducing, output_dim, 1.0, betapsi1_gpu.gpudata, nSlice, dL_dpsi2R_gpu.gpudata, num_inducing, 1.0, dL_dpsi1_gpu.gpudata, nSlice) cublas.cublasDgemm(self.cublas_handle, 'N', 'N', nSlice, num_inducing, output_dim, beta, psi1p_gpu.gpudata, nSlice, dL_dpsi2R_gpu.gpudata, num_inducing, 1.0, dL_dpsi1_gpu.gpudata, nSlice)
#====================================================================== #======================================================================
# Compute dL_dthetaL # Compute dL_dthetaL
#====================================================================== #======================================================================
if het_noise:
if uncertain_inputs and not het_noise: betaY = betaYT_gpu_slice.get()
if isEnd: dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0p_gpu.get())-output_dim*beta)/2.
dL_dthetaL = self.midRes['dL_dthetaL'] dL_dthetaL += -beta*beta*cublas.cublasDdot(self.cublas_handle,dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata,1,psi2p_gpu.gpudata,1)
else: dL_dthetaL += -beta*(betaY*np.dot(psi1p_gpu.get(),v_gpu.get())).sum(axis=-1)
dL_dthetaL = 0
# if not uncertain_inputs:
# cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, num_inducing, nSlice, beta, psi1p_gpu.gpudata, nSlice, psi1p_gpu.gpudata, nSlice, 1.0, psi2p_gpu.gpudata, num_inducing)
# join_prod(psi2p_gpu,psi1p_gpu,psi1p_gpu,nSlice,num_inducing)
# psiR = cublas.cublasDdot(self.cublas_handle, dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata, 1, psi2p_gpu.gpudata, 1)
# # mul_bcast_first(psi2R_gpu,dL_dpsi2R_gpu,psi2p_gpu,nSlice)
#
# dL_dthetaL_gpu.fill(0.)
# dL_dthetaL = 0.
#
# dL_dthetaL += cublas.cublasDdot(self.cublas_handle, betaYT_gpu_slice.size, betaYT_gpu_slice.gpudata,1,betaYT_gpu_slice.gpudata,1)
#
# cublas.cublasDcopy(self.cublas_handle, betaYT_gpu_slice.size, betaYT_gpu_slice.gpudata, 1, betaYT2_gpu.gpudata, 1)
# mul_bcast(betaYT2_gpu,betaYT2_gpu,betaYT2_gpu,betaYT2_gpu.size)
# cublas.cublasDscal(self.cublas_handle, betaYT2_gpu.size, 0.5, betaYT2_gpu.gpudata, 1)
# sum_axis(dL_dthetaL_gpu, betaYT2_gpu, 1, output_dim)
#
# cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, output_dim/(-2.0), beta_gpu_slice.gpudata, 1, dL_dthetaL_gpu.gpudata, 1)
# cublas.cublasDcopy(self.cublas_handle, beta_gpu_slice.size, beta_gpu_slice.gpudata, 1, thetaL_t_gpu.gpudata, 1)
# mul_bcast(thetaL_t_gpu,thetaL_t_gpu,thetaL_t_gpu,thetaL_t_gpu.size)
# mul_bcast(thetaL_t_gpu,thetaL_t_gpu,psi0p_gpu,thetaL_t_gpu.size)
# cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, output_dim/2.0, thetaL_t_gpu.gpudata, 1, dL_dthetaL_gpu.gpudata, 1)
#
# thetaL_t_gpu.fill(0.)
# sum_axis(thetaL_t_gpu, psi2R_gpu, nSlice, num_inducing*num_inducing)
# mul_bcast(thetaL_t_gpu,thetaL_t_gpu,beta_gpu_slice,thetaL_t_gpu.size)
# mul_bcast(thetaL_t_gpu,thetaL_t_gpu,beta_gpu_slice,thetaL_t_gpu.size)
# cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, -1.0, thetaL_t_gpu.gpudata, 1, dL_dthetaL_gpu.gpudata, 1)
#
# cublas.cublasDgemm(self.cublas_handle, 'T', 'T', output_dim, nSlice, num_inducing, -1.0, v_gpu.gpudata, num_inducing, betapsi1_gpu.gpudata, nSlice, 0.0, betaYT2_gpu.gpudata, output_dim)
# mul_bcast(betaYT2_gpu,betaYT2_gpu,betaYT_gpu_slice,betaYT2_gpu.size)
# sum_axis(dL_dthetaL_gpu, betaYT2_gpu, 1, output_dim)
if kern.useGPU: if kern.useGPU:
dL_dpsi0 = dL_dpsi0_gpu dL_dpsi0 = dL_dpsi0_gpu
@ -548,10 +489,11 @@ class VarDTC_GPU(object):
dL_dpsi2 = dL_dpsi2_gpu dL_dpsi2 = dL_dpsi2_gpu
else: else:
dL_dpsi2 = dL_dpsi2_gpu.get() dL_dpsi2 = dL_dpsi2_gpu.get()
if het_noise: if not het_noise:
dL_dthetaL = dL_dthetaL_gpu.get() if isEnd:
else: dL_dthetaL = self.midRes['dL_dthetaL']
dL_dthetaL = gpuarray.sum(dL_dthetaL_gpu).get() else:
dL_dthetaL = 0.
if uncertain_inputs: if uncertain_inputs:
grad_dict = {'dL_dpsi0':dL_dpsi0, grad_dict = {'dL_dpsi0':dL_dpsi0,
'dL_dpsi1':dL_dpsi1, 'dL_dpsi1':dL_dpsi1,

View file

@ -116,7 +116,7 @@ class VarDTC_minibatch(object):
if het_noise: if het_noise:
psi2_full += beta_slice*np.outer(psi1,psi1) psi2_full += beta_slice*np.outer(psi1,psi1)
else: else:
psi2_full += np.outer(psi1.T,psi1.T) psi2_full += np.dot(psi1.T,psi1)
if not het_noise: if not het_noise:
psi0_full *= beta psi0_full *= beta
@ -176,7 +176,7 @@ class VarDTC_minibatch(object):
# Compute dL_dthetaL for uncertian input and non-heter noise # Compute dL_dthetaL for uncertian input and non-heter noise
#====================================================================== #======================================================================
if uncertain_inputs and not het_noise: if not het_noise:
dL_dthetaL = (YRY_full*beta + beta*output_dim*psi0_full - num_data*output_dim*beta)/2. - beta*(dL_dpsi2R*psi2_full).sum() - beta*(v.T*psi1Y_full).sum() dL_dthetaL = (YRY_full*beta + beta*output_dim*psi0_full - num_data*output_dim*beta)/2. - beta*(dL_dpsi2R*psi2_full).sum() - beta*(v.T*psi1Y_full).sum()
self.midRes['dL_dthetaL'] = dL_dthetaL self.midRes['dL_dthetaL'] = dL_dthetaL
@ -265,14 +265,10 @@ class VarDTC_minibatch(object):
dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0)-output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum(axis=-1) dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0)-output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum(axis=-1)
else: else:
if uncertain_inputs: if isEnd:
if isEnd: dL_dthetaL = self.midRes['dL_dthetaL']
dL_dthetaL = self.midRes['dL_dthetaL']
else:
dL_dthetaL = 0.
else: else:
psiR = np.einsum('nm,no,mo->',psi1,psi1,dL_dpsi2R) dL_dthetaL = 0.
dL_dthetaL = ((np.square(betaY)).sum() + beta*beta*output_dim*(psi0.sum())-num_slice*output_dim*beta)/2. - beta*beta*psiR- (betaY*np.dot(betapsi1,v)).sum()
if uncertain_inputs: if uncertain_inputs:
grad_dict = {'dL_dpsi0':dL_dpsi0, grad_dict = {'dL_dpsi0':dL_dpsi0,

View file

@ -42,7 +42,8 @@ class SSGPLVM(SparseGP):
X_variance = np.random.uniform(0,.1,X.shape) X_variance = np.random.uniform(0,.1,X.shape)
gamma = np.empty_like(X, order='F') # The posterior probabilities of the binary variable in the variational approximation gamma = np.empty_like(X, order='F') # The posterior probabilities of the binary variable in the variational approximation
gamma[:] = 0.5 + 0.01 * np.random.randn(X.shape[0], input_dim) #gamma[:] = 0.5 + 0.01 * np.random.randn(X.shape[0], input_dim)
gamma[:] = 0.5
if group_spike: if group_spike:
gamma[:] = gamma.mean(axis=0) gamma[:] = gamma.mean(axis=0)

View file

@ -19,8 +19,8 @@ try:
strideSum = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="i%step==0?x[i]:0", arguments="double *x, int step") strideSum = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="i%step==0?x[i]:0", arguments="double *x, int step")
# np.trace(np.dot(A,B)) (also equivalent to (A*B.T).sum() ) A - n x m, B - m x n # np.trace(np.dot(A,B)) (also equivalent to (A*B.T).sum() ) A - a1 x a2, B - a2 x a1
traceDot = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="A[i]*B[(i%n)*m+i/n]", arguments="double *A, double *B, int n, int m") traceDot = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="A[i]*B[(i%a1)*a2+i/a1]", arguments="double *A, double *B, int a1, int a2")
#======================================================================================= #=======================================================================================
# Element-wise functions # Element-wise functions