mirror of
https://github.com/SheffieldML/GPy.git
synced 2026-04-28 22:36:24 +02:00
[GPU] bug fix
This commit is contained in:
parent
f07f66f1f7
commit
bb5c41f64c
3 changed files with 22 additions and 23 deletions
|
|
@ -61,15 +61,15 @@ class VarDTC_GPU(object):
|
||||||
'psi1Y_gpu' :gpuarray.empty((num_inducing,output_dim),np.float64,order='F'),
|
'psi1Y_gpu' :gpuarray.empty((num_inducing,output_dim),np.float64,order='F'),
|
||||||
'psi2_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'),
|
'psi2_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'),
|
||||||
'beta_gpu' :gpuarray.empty((output_dim,),np.float64,order='F'),
|
'beta_gpu' :gpuarray.empty((output_dim,),np.float64,order='F'),
|
||||||
'Y_gpu' :gpuarray.to_gpu(np.asfortranarray(Y)),
|
'YT_gpu' :gpuarray.to_gpu(np.asfortranarray(Y).T), # DxN
|
||||||
'betaY_gpu' :gpuarray.empty(Y.shape,np.float64,order='F'),
|
'betaYT_gpu' :gpuarray.empty(Y.T.shape,np.float64,order='F'), # DxN
|
||||||
'psi2_t_gpu' :gpuarray.empty((self.batchsize,num_inducing,num_inducing),np.float64,order='F'),
|
'psi2_t_gpu' :gpuarray.empty((self.batchsize,num_inducing,num_inducing),np.float64,order='F'),
|
||||||
# inference_minibatch
|
# inference_minibatch
|
||||||
}
|
}
|
||||||
self.gpuCache['ones_gpu'].fill(1.0)
|
self.gpuCache['ones_gpu'].fill(1.0)
|
||||||
|
|
||||||
Y_gpu = self.gpuCache['Y_gpu']
|
YT_gpu = self.gpuCache['YT_gpu']
|
||||||
self._trYYT = cublas.cublasDdot(self.cublas_handle, Y_gpu.size, Y_gpu.gpudata, 1, Y_gpu.gpudata, 1)
|
self._trYYT = cublas.cublasDdot(self.cublas_handle, YT_gpu.size, YT_gpu.gpudata, 1, YT_gpu.gpudata, 1)
|
||||||
|
|
||||||
def _get_YYTfactor(self, Y):
|
def _get_YYTfactor(self, Y):
|
||||||
"""
|
"""
|
||||||
|
|
@ -109,32 +109,32 @@ class VarDTC_GPU(object):
|
||||||
psi1Y_gpu = self.gpuCache['psi1Y_gpu']
|
psi1Y_gpu = self.gpuCache['psi1Y_gpu']
|
||||||
psi2_gpu = self.gpuCache['psi2_gpu']
|
psi2_gpu = self.gpuCache['psi2_gpu']
|
||||||
beta_gpu = self.gpuCache['beta_gpu']
|
beta_gpu = self.gpuCache['beta_gpu']
|
||||||
Y_gpu = self.gpuCache['Y_gpu']
|
YT_gpu = self.gpuCache['YT_gpu']
|
||||||
betaY_gpu = self.gpuCache['betaY_gpu']
|
betaYT_gpu = self.gpuCache['betaYT_gpu']
|
||||||
psi2_t_gpu = self.gpuCache['psi2_t_gpu']
|
psi2_t_gpu = self.gpuCache['psi2_t_gpu']
|
||||||
|
|
||||||
if het_noise:
|
if het_noise:
|
||||||
beta_gpu.set(np.asfortranarray(beta))
|
beta_gpu.set(np.asfortranarray(beta))
|
||||||
mul_bcast(betaY_gpu,beta_gpu,Y_gpu,beta_gpu.size)
|
mul_bcast(betaYT_gpu,beta_gpu,YT_gpu,beta_gpu.size)
|
||||||
YRY_full = cublas.cublasDdot(self.cublas_handle, Y_gpu.size, betaY_gpu.gpudata, 1, Y_gpu.gpudata, 1)
|
YRY_full = cublas.cublasDdot(self.cublas_handle, YT_gpu.size, betaYT_gpu.gpudata, 1, YT_gpu.gpudata, 1)
|
||||||
else:
|
else:
|
||||||
beta_gpu.fill(beta)
|
beta_gpu.fill(beta)
|
||||||
betaY_gpu.fill(0.)
|
betaYT_gpu.fill(0.)
|
||||||
cublas.cublasDaxpy(self.cublas_handle, betaY_gpu.size, beta, Y_gpu.gpudata, 1, betaY_gpu.gpudata, 1)
|
cublas.cublasDaxpy(self.cublas_handle, betaYT_gpu.size, beta, YT_gpu.gpudata, 1, betaYT_gpu.gpudata, 1)
|
||||||
YRY_full = trYYT*beta
|
YRY_full = trYYT*beta
|
||||||
|
|
||||||
if kern.useGPU:
|
if kern.useGPU:
|
||||||
psi1Y_gpu.fill(0.)
|
psi1Y_gpu.fill(0.)
|
||||||
psi2_gpu.fill(0.)
|
psi2_gpu.fill(0.)
|
||||||
psi0_full = 0
|
psi0_full = 0
|
||||||
|
psi1Y_full = np.zeros((num_inducing,output_dim),order='F') # MxD
|
||||||
|
|
||||||
for n_start in xrange(0,num_data,self.batchsize):
|
for n_start in xrange(0,num_data,self.batchsize):
|
||||||
n_end = min(self.batchsize+n_start, num_data)
|
n_end = min(self.batchsize+n_start, num_data)
|
||||||
ndata = n_end - n_start
|
ndata = n_end - n_start
|
||||||
Y_slice = Y[n_start:n_end]
|
|
||||||
X_slice = X[n_start:n_end]
|
X_slice = X[n_start:n_end]
|
||||||
beta_gpu_slice = beta_gpu[n_start:n_end]
|
beta_gpu_slice = beta_gpu[n_start:n_end]
|
||||||
betaY_gpu_slice = betaY_gpu[n_start:n_end]
|
betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end]
|
||||||
if ndata==self.batchsize:
|
if ndata==self.batchsize:
|
||||||
psi2_t_gpu_slice = psi2_t_gpu
|
psi2_t_gpu_slice = psi2_t_gpu
|
||||||
else:
|
else:
|
||||||
|
|
@ -147,7 +147,12 @@ class VarDTC_GPU(object):
|
||||||
psi0p_gpu = kern.Kdiag(X_slice)
|
psi0p_gpu = kern.Kdiag(X_slice)
|
||||||
psi1p_gpu = kern.K(X_slice, Z)
|
psi1p_gpu = kern.K(X_slice, Z)
|
||||||
|
|
||||||
cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, output_dim, ndata, 1.0, psi1p_gpu.gpudata, ndata, betaY_gpu_slice.gpudata, ndata, 1.0, psi1Y_gpu.gpudata, num_inducing)
|
cublas.cublasDgemm(self.cublas_handle, 'T', 'T', num_inducing, output_dim, ndata, 1.0, psi1p_gpu.gpudata, ndata, betaYT_gpu_slice.gpudata, output_dim, 1.0, psi1Y_gpu.gpudata, num_inducing)
|
||||||
|
psi1Y_full += np.dot(psi1p_gpu.get().T,Y_slice)*beta # MxD
|
||||||
|
# print psi1Y_gpu.get()
|
||||||
|
# print psi1Y_full
|
||||||
|
print np.abs(psi1Y_gpu.get()-psi1Y_full).max()
|
||||||
|
|
||||||
if het_noise:
|
if het_noise:
|
||||||
psi0_full += cublas.cublasDdot(self.cublas_handle, psi0p_gpu.size, beta_gpu_slice.gpudata, 1, psi0p_gpu.gpudata, 1)
|
psi0_full += cublas.cublasDdot(self.cublas_handle, psi0p_gpu.size, beta_gpu_slice.gpudata, 1, psi0p_gpu.gpudata, 1)
|
||||||
else:
|
else:
|
||||||
|
|
|
||||||
|
|
@ -260,7 +260,7 @@ class PSICOMP_SSRBF(object):
|
||||||
self.gpuCache = None
|
self.gpuCache = None
|
||||||
|
|
||||||
def _initGPUCache(self, N, M, Q):
|
def _initGPUCache(self, N, M, Q):
|
||||||
if self.gpuCache and self.gpuCacheAll['mu_gpu'].shape[0]<N:
|
if self.gpuCache and self.gpuCacheAll['mu_gpu'].shape[0]!=N:
|
||||||
self._releaseMemory()
|
self._releaseMemory()
|
||||||
|
|
||||||
if self.gpuCache == None:
|
if self.gpuCache == None:
|
||||||
|
|
@ -304,13 +304,6 @@ class PSICOMP_SSRBF(object):
|
||||||
'grad_S_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
'grad_S_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
||||||
'grad_gamma_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
'grad_gamma_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
||||||
}
|
}
|
||||||
nonN_list = ['l_gpu','Z_gpu','psi2exp2_gpu','grad_l_gpu','grad_Z_gpu']
|
|
||||||
self._gpuCache_Nlist = [k for k in self.gpuCacheAll.keys() if k not in nonN_list]
|
|
||||||
self.gpuCache = self.gpuCacheAll
|
|
||||||
elif self.gpuCacheAll['mu_gpu'].shape[0]>N:
|
|
||||||
self.gpuCache = self.gpuCacheAll.copy()
|
|
||||||
for k in self._gpuCache_Nlist:
|
|
||||||
self.gpuCache[k] = self.gpuCacheAll[k][0:N]
|
|
||||||
|
|
||||||
def _releaseMemory(self):
|
def _releaseMemory(self):
|
||||||
if not self.gpuCacheAll:
|
if not self.gpuCacheAll:
|
||||||
|
|
@ -361,7 +354,8 @@ class PSICOMP_SSRBF(object):
|
||||||
comp_psi1(psi1_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi1denom_gpu, N, M, Q)
|
comp_psi1(psi1_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi1denom_gpu, N, M, Q)
|
||||||
comp_psi2(psi2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi2denom_gpu, N, M, Q)
|
comp_psi2(psi2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi2denom_gpu, N, M, Q)
|
||||||
|
|
||||||
return psi0_gpu.get(), psi1_gpu.get(), psi2_gpu.get()
|
# return psi0_gpu.get(), psi1_gpu.get(), psi2_gpu.get()
|
||||||
|
return psi0_gpu, psi1_gpu, psi2_gpu
|
||||||
|
|
||||||
def _psiDercomputations(self, variance, lengthscale, Z, mu, S, gamma):
|
def _psiDercomputations(self, variance, lengthscale, Z, mu, S, gamma):
|
||||||
"""Compute the derivatives w.r.t. Psi statistics"""
|
"""Compute the derivatives w.r.t. Psi statistics"""
|
||||||
|
|
|
||||||
|
|
@ -28,7 +28,7 @@ try:
|
||||||
# log(1.0-X)
|
# log(1.0-X)
|
||||||
logOne = ElementwiseKernel("double *in, double *out", "out[i] = log(1.-in[i])", "logOne_element")
|
logOne = ElementwiseKernel("double *in, double *out", "out[i] = log(1.-in[i])", "logOne_element")
|
||||||
|
|
||||||
# multiplication with broadcast on the last dimension
|
# multiplication with broadcast on the last dimension (out = shorter[:,None]*longer)
|
||||||
mul_bcast = ElementwiseKernel("double *out, double *shorter, double *longer, int shorter_size", "out[i] = longer[i]*shorter[i%shorter_size]", "mul_bcast")
|
mul_bcast = ElementwiseKernel("double *out, double *shorter, double *longer, int shorter_size", "out[i] = longer[i]*shorter[i%shorter_size]", "mul_bcast")
|
||||||
|
|
||||||
# sum through the middle dimension (size_2) of a 3D matrix (size_1, size_2, size_3)
|
# sum through the middle dimension (size_2) of a 3D matrix (size_1, size_2, size_3)
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue