mirror of https://github.com/SheffieldML/GPy.git
synced 2026-05-10 12:32:40 +02:00

commit ca1edecce4 (parent e486f3fd99): rbf gpu usable

5 changed files with 140 additions and 146 deletions
@@ -66,7 +66,6 @@ class VarDTC_GPU(LatentFunctionInference):
             'beta_gpu'   :gpuarray.empty((ndata,),np.float64,order='F'),
             'YT_gpu'     :gpuarray.to_gpu(np.asfortranarray(Y.T)), # DxN
             'betaYT_gpu' :gpuarray.empty(Y.T.shape,np.float64,order='F'), # DxN
-            'psi2_t_gpu' :gpuarray.empty((num_inducing*num_inducing*self.batchsize),np.float64,order='F'),
             # inference_minibatch
             'dL_dpsi0_gpu' :gpuarray.empty((self.batchsize,),np.float64,order='F'),
             'dL_dpsi1_gpu' :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'),
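Aside: every buffer in this cache is allocated column-major (order='F') because CUBLAS assumes Fortran layout. A minimal host-side sketch of the same convention, NumPy only (the dict keys are stand-ins for the cache entries above):

    import numpy as np

    ndata, output_dim = 4, 2
    Y = np.random.randn(ndata, output_dim)

    cache = {
        'beta':   np.empty((ndata,), dtype=np.float64, order='F'),
        'YT':     np.asfortranarray(Y.T),   # DxN, column-major copy
        'betaYT': np.empty(Y.T.shape, dtype=np.float64, order='F'),
    }
    assert cache['YT'].flags['F_CONTIGUOUS']   # the layout CUBLAS expects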
@@ -122,6 +121,89 @@ class VarDTC_GPU(LatentFunctionInference):
         else:
             return jitchol(tdot(Y))
 
+    def gatherPsiStat(self, kern, X, Z, Y, beta, uncertain_inputs, het_noise):
+        num_inducing, input_dim = Z.shape[0], Z.shape[1]
+        num_data, output_dim = Y.shape
+        trYYT = self._trYYT
+
+        psi1Y_gpu = self.gpuCache['psi1Y_gpu']
+        psi2_gpu = self.gpuCache['psi2_gpu']
+        beta_gpu = self.gpuCache['beta_gpu']
+        YT_gpu = self.gpuCache['YT_gpu']
+        betaYT_gpu = self.gpuCache['betaYT_gpu']
+
+        beta_gpu.fill(beta)
+        betaYT_gpu.fill(0.)
+        cublas.cublasDaxpy(self.cublas_handle, betaYT_gpu.size, beta, YT_gpu.gpudata, 1, betaYT_gpu.gpudata, 1)
+        YRY_full = trYYT*beta
+
+        if kern.useGPU:
+            psi1Y_gpu.fill(0.)
+            psi2_gpu.fill(0.)
+            psi0_full = 0
+
+            for n_start in xrange(0,num_data,self.batchsize):
+                n_end = min(self.batchsize+n_start, num_data)
+                ndata = n_end - n_start
+                X_slice = X[n_start:n_end]
+                betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end]
+
+                if uncertain_inputs:
+                    psi0 = kern.psi0(Z, X_slice)
+                    psi1p_gpu = kern.psi1(Z, X_slice)
+                    psi2p_gpu = kern.psi2(Z, X_slice)
+                else:
+                    psi0 = kern.Kdiag(X_slice)
+                    psi1p_gpu = kern.K(X_slice, Z)
+
+                cublas.cublasDgemm(self.cublas_handle, 'T', 'T', num_inducing, output_dim, ndata, 1.0, psi1p_gpu.gpudata, ndata, betaYT_gpu_slice.gpudata, output_dim, 1.0, psi1Y_gpu.gpudata, num_inducing)
+
+                psi0_full += psi0.sum()
+
+                if uncertain_inputs:
+                    sum_axis(psi2_gpu,psi2p_gpu,1,1)
+                else:
+                    cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, num_inducing, ndata, beta, psi1p_gpu.gpudata, ndata, psi1p_gpu.gpudata, ndata, 1.0, psi2_gpu.gpudata, num_inducing)
+
+            psi0_full *= beta
+            if uncertain_inputs:
+                cublas.cublasDscal(self.cublas_handle, psi2_gpu.size, beta, psi2_gpu.gpudata, 1)
+
+        else:
+            psi2_full = np.zeros((num_inducing,num_inducing))
+            psi1Y_full = np.zeros((output_dim,num_inducing)) # DxM
+            psi0_full = 0.
+            YRY_full = 0.
+
+            for n_start in xrange(0,num_data,self.batchsize):
+                n_end = min(self.batchsize+n_start, num_data)
+                Y_slice = Y[n_start:n_end]
+                X_slice = X[n_start:n_end]
+
+                if het_noise:
+                    b = beta[n_start]
+                    YRY_full += np.inner(Y_slice, Y_slice)*b
+                else:
+                    b = beta
+
+                if uncertain_inputs:
+                    psi0 = kern.psi0(Z, X_slice)
+                    psi1 = kern.psi1(Z, X_slice)
+                    psi2_full += kern.psi2(Z, X_slice)*b
+                else:
+                    psi0 = kern.Kdiag(X_slice)
+                    psi1 = kern.K(X_slice, Z)
+                    psi2_full += np.dot(psi1.T,psi1)*b
+
+                psi0_full += psi0.sum()*b
+                psi1Y_full += np.dot(Y_slice.T,psi1)*b # DxM
+
+            if not het_noise:
+                YRY_full = trYYT*beta
+            psi1Y_gpu.set(psi1Y_full)
+            psi2_gpu.set(psi2_full)
+
+        return psi0_full, YRY_full
+
     def inference_likelihood(self, kern, X, Z, likelihood, Y):
         """
         The first phase of inference:
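In NumPy terms, the CUBLAS calls in the new gatherPsiStat reduce to a few matrix products. A rough host-side sketch of the homoscedastic, uncertain_inputs=False accumulation (illustrative only; batching, GPU buffers, and transfers are omitted):

    import numpy as np

    def gather_psi_stat_sketch(Kdiag, psi1, Y, beta):
        # Dgemm('T','T',...): psi1Y += psi1^T (beta*Y)  ->  MxD
        psi1Y = psi1.T.dot(beta * Y)
        # Dgemm('T','N',...) with alpha=beta: psi2 += beta * psi1^T psi1  ->  MxM
        psi2 = beta * psi1.T.dot(psi1)
        psi0 = beta * Kdiag.sum()
        YRY = beta * np.square(Y).sum()   # trYYT * beta
        return psi0, psi1Y, psi2, YRY

    N, M, D = 6, 3, 2
    print(gather_psi_stat_sketch(np.ones(N), np.random.randn(N, M),
                                 np.random.randn(N, D), 0.5))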
@@ -146,118 +228,10 @@ class VarDTC_GPU(LatentFunctionInference):
         else:
             uncertain_inputs = False
 
-        trYYT = self._trYYT
-
         psi1Y_gpu = self.gpuCache['psi1Y_gpu']
         psi2_gpu = self.gpuCache['psi2_gpu']
-        beta_gpu = self.gpuCache['beta_gpu']
-        YT_gpu = self.gpuCache['YT_gpu']
-        betaYT_gpu = self.gpuCache['betaYT_gpu']
-        psi2_t_gpu = self.gpuCache['psi2_t_gpu']
-
-        if het_noise:
-            beta_gpu.set(np.asfortranarray(beta))
-            mul_bcast(betaYT_gpu,beta_gpu,YT_gpu,beta_gpu.size)
-            YRY_full = cublas.cublasDdot(self.cublas_handle, YT_gpu.size, betaYT_gpu.gpudata, 1, YT_gpu.gpudata, 1)
-        else:
-            beta_gpu.fill(beta)
-            betaYT_gpu.fill(0.)
-            cublas.cublasDaxpy(self.cublas_handle, betaYT_gpu.size, beta, YT_gpu.gpudata, 1, betaYT_gpu.gpudata, 1)
-            YRY_full = trYYT*beta
-
-        if kern.useGPU:
-            psi1Y_gpu.fill(0.)
-            psi2_gpu.fill(0.)
-            psi0_full = 0
-
-            for n_start in xrange(0,num_data,self.batchsize):
-                n_end = min(self.batchsize+n_start, num_data)
-                ndata = n_end - n_start
-                X_slice = X[n_start:n_end]
-                beta_gpu_slice = beta_gpu[n_start:n_end]
-                betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end]
-                if ndata==self.batchsize:
-                    psi2_t_gpu_slice = psi2_t_gpu
-                else:
-                    psi2_t_gpu_slice = psi2_t_gpu[:num_inducing*num_inducing*ndata]
-                if uncertain_inputs:
-                    psi0p_gpu = kern.psi0(Z, X_slice)
-                    psi1p_gpu = kern.psi1(Z, X_slice)
-                    psi2p_gpu = kern.psi2(Z, X_slice)
-                else:
-                    psi0p_gpu = kern.Kdiag(X_slice)
-                    psi1p_gpu = kern.K(X_slice, Z)
-
-                cublas.cublasDgemm(self.cublas_handle, 'T', 'T', num_inducing, output_dim, ndata, 1.0, psi1p_gpu.gpudata, ndata, betaYT_gpu_slice.gpudata, output_dim, 1.0, psi1Y_gpu.gpudata, num_inducing)
-
-                if het_noise:
-                    psi0_full += cublas.cublasDdot(self.cublas_handle, psi0p_gpu.size, beta_gpu_slice.gpudata, 1, psi0p_gpu.gpudata, 1)
-                else:
-                    psi0_full += gpuarray.sum(psi0p_gpu).get()
-
-                if uncertain_inputs:
-                    if het_noise:
-                        mul_bcast(psi2_t_gpu_slice,beta_gpu_slice,psi2p_gpu,beta_gpu_slice.size)
-                        sum_axis(psi2_gpu,psi2_t_gpu_slice,1,ndata)
-                    else:
-                        sum_axis(psi2_gpu,psi2p_gpu,1,ndata)
-                else:
-                    if het_noise:
-                        psi1_t_gpu = psi2_t_gpu_slice[:,num_inducing*ndata]
-                        mul_bcast(psi1_t_gpu,beta_gpu_slice,psi1p_gpu,beta_gpu_slice.size)
-                        cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, num_inducing, ndata, 1.0, psi1p_gpu.gpudata, ndata, psi1_t_gpu.gpudata, ndata, 1.0, psi2_gpu.gpudata, num_inducing)
-                    else:
-                        cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, num_inducing, ndata, beta, psi1p_gpu.gpudata, ndata, psi1p_gpu.gpudata, ndata, 1.0, psi2_gpu.gpudata, num_inducing)
-
-            if not het_noise:
-                psi0_full *= beta
-                if uncertain_inputs:
-                    cublas.cublasDscal(self.cublas_handle, psi2_gpu.size, beta, psi2_gpu.gpudata, 1)
-
-        else:
-            psi2_full = np.zeros((num_inducing,num_inducing),order='F')
-            psi1Y_full = np.zeros((num_inducing,output_dim),order='F') # MxD
-            psi0_full = 0
-
-            for n_start in xrange(0,num_data,self.batchsize):
-                n_end = min(self.batchsize+n_start, num_data)
-                Y_slice = Y[n_start:n_end]
-                X_slice = X[n_start:n_end]
-                if uncertain_inputs:
-                    psi0 = kern.psi0(Z, X_slice)
-                    psi1 = kern.psi1(Z, X_slice)
-                    psi2 = kern.psi2(Z, X_slice)
-                else:
-                    psi0 = kern.Kdiag(X_slice)
-                    psi1 = kern.K(X_slice, Z)
-
-                if het_noise:
-                    beta_slice = beta[n_start:n_end]
-                    psi0_full += (beta_slice*psi0).sum()
-                    psi1Y_full += np.dot(psi1.T,beta_slice[:,None]*Y_slice) # MxD
-                else:
-                    psi0_full += psi0.sum()
-                    psi1Y_full += np.dot(psi1.T,Y_slice) # MxD
-
-                if uncertain_inputs:
-                    if het_noise:
-                        psi2_full += np.einsum('n,nmo->mo',beta_slice,psi2)
-                    else:
-                        psi2_full += psi2
-                else:
-                    if het_noise:
-                        psi2_full += np.einsum('n,nm,no->mo',beta_slice,psi1,psi1)
-                    else:
-                        psi2_full += np.outer(psi1.T, psi1.T)
-
-            if not het_noise:
-                psi0_full *= beta
-                psi1Y_full *= beta
-                psi2_full *= beta
-
-            psi1Y_gpu.set(psi1Y_full)
-            psi2_gpu.set(psi2_full)
+        psi0_full, YRY_full = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs, het_noise)
 
         #======================================================================
         # Compute Common Components
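For reference, the deleted heteroscedastic path computed YRY with a single cublasDdot over the flattened DxN buffers. The identity it relied on, as a NumPy sketch (beta assumed to be a length-N precision vector):

    import numpy as np

    N, D = 5, 3
    Y = np.random.randn(N, D)
    beta = np.random.rand(N)

    betaYT = beta * Y.T                               # DxN, the mul_bcast step
    yry_dot = np.dot(betaYT.ravel(), Y.T.ravel())     # the cublasDdot step
    yry_ref = np.sum(beta * np.square(Y).sum(axis=1))
    assert np.allclose(yry_dot, yry_ref)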
@@ -73,13 +73,14 @@ class VarDTC_minibatch(LatentFunctionInference):
         else:
             return jitchol(tdot(Y))
 
-    def gatherPsiStat(self, kern, X, Z, Y, beta, uncertain_inputs, het_noise):
+    def gatherPsiStat(self, kern, X, Z, Y, beta, uncertain_inputs):
 
         num_inducing = Z.shape[0]
         num_data, output_dim = Y.shape
 
         if self.batchsize == None:
             self.batchsize = num_data
+        het_noise = beta.size > 1
 
         trYYT = self.get_trYYT(Y)
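The new het_noise = beta.size > 1 line infers the noise model from the shape of beta instead of taking it as an argument: a scalar (size-1) beta means one shared precision, a length-N beta means one precision per data point. A tiny sketch of the convention:

    import numpy as np

    def is_het_noise(beta):
        return np.asarray(beta).size > 1

    assert not is_het_noise(2.5)        # shared noise precision
    assert is_het_noise(np.ones(10))    # per-point precision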
@@ -88,46 +89,30 @@ class VarDTC_minibatch(LatentFunctionInference):
         psi0_full = 0.
         YRY_full = 0.
 
         for n_start in xrange(0,num_data,self.batchsize):
             n_end = min(self.batchsize+n_start, num_data)
             Y_slice = Y[n_start:n_end]
             X_slice = X[n_start:n_end]
 
+            if het_noise:
+                b = beta[n_start]
+                YRY_full += np.inner(Y_slice, Y_slice)*b
+            else:
+                b = beta
+
             if uncertain_inputs:
                 psi0 = kern.psi0(Z, X_slice)
                 psi1 = kern.psi1(Z, X_slice)
-                psi2 = kern.psi2(Z, X_slice)
+                psi2_full += kern.psi2(Z, X_slice)*b
             else:
                 psi0 = kern.Kdiag(X_slice)
                 psi1 = kern.K(X_slice, Z)
-                psi2 = None
+                psi2_full += np.dot(psi1.T,psi1)*b
 
-            if het_noise:
-                beta_slice = beta[n_start:n_end]
-                psi0_full += (beta_slice*psi0).sum()
-                psi1Y_full += np.dot(beta_slice*Y_slice.T,psi1) # DxM
-                YRY_full += (beta_slice*np.square(Y_slice).sum(axis=-1)).sum()
-            else:
-                psi0_full += psi0.sum()
-                psi1Y_full += np.dot(Y_slice.T,psi1) # DxM
-
-            if uncertain_inputs:
-                if het_noise:
-                    psi2_full += beta_slice*psi2
-                else:
-                    psi2_full += psi2
-            else:
-                if het_noise:
-                    psi2_full += beta_slice*np.outer(psi1,psi1)
-                else:
-                    psi2_full += np.dot(psi1.T,psi1)
-
+            psi0_full += psi0.sum()*b
+            psi1Y_full += np.dot(Y_slice.T,psi1)*b # DxM
 
         if not het_noise:
-            psi0_full *= beta
-            psi1Y_full *= beta
-            psi2_full *= beta
             YRY_full = trYYT*beta
 
         if self.mpi_comm != None:
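The rewritten loop folds the noise precision into one per-batch scalar b up front (with heteroscedastic noise the caller forces batchsize=1, so beta[n_start] is exact) instead of branching on het_noise at every accumulation. A compact host-side sketch of the new control flow (kern_K is a hypothetical stand-in for kern.K, and the Kdiag term assumes an RBF-like constant diagonal):

    import numpy as np

    def gather_psi_stat_sketch(kern_K, X, Y, Z, beta, batchsize):
        N, M = Y.shape[0], Z.shape[0]
        het = np.asarray(beta).size > 1
        psi1Y = np.zeros((Y.shape[1], M))   # DxM
        psi2 = np.zeros((M, M))
        psi0 = 0.
        YRY = 0.
        for s in range(0, N, batchsize):
            e = min(s + batchsize, N)
            Ys, Xs = Y[s:e], X[s:e]
            b = beta[s] if het else beta    # one precision per batch
            psi1 = kern_K(Xs, Z)
            psi2 += np.dot(psi1.T, psi1) * b
            psi0 += (e - s) * b             # Kdiag stand-in
            psi1Y += np.dot(Ys.T, psi1) * b
            if het:
                YRY += np.square(Ys).sum() * b
        if not het:
            YRY = np.square(Y).sum() * beta  # trYYT * beta
        return psi0, psi1Y, psi2, YRY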
@@ -168,7 +153,7 @@ class VarDTC_minibatch(LatentFunctionInference):
         if het_noise:
             self.batchsize = 1
 
-        psi0_full, psi1Y_full, psi2_full, YRY_full = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs, het_noise)
+        psi0_full, psi1Y_full, psi2_full, YRY_full = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs)
 
         #======================================================================
         # Compute Common Components
@@ -248,7 +248,6 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
             'Z_gpu'     :gpuarray.empty((M,Q),np.float64,order='F'),
             'mu_gpu'    :gpuarray.empty((N,Q),np.float64,order='F'),
             'S_gpu'     :gpuarray.empty((N,Q),np.float64,order='F'),
-            'psi0_gpu'  :gpuarray.empty((N,),np.float64,order='F'),
             'psi1_gpu'  :gpuarray.empty((N,M),np.float64,order='F'),
             'psi2_gpu'  :gpuarray.empty((M,M),np.float64,order='F'),
             'psi2n_gpu' :gpuarray.empty((N,M,M),np.float64,order='F'),
@@ -265,6 +264,10 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
             'grad_mu_gpu' :gpuarray.empty((N,Q,),np.float64, order='F'),
             'grad_S_gpu'  :gpuarray.empty((N,Q,),np.float64, order='F'),
             }
+        else:
+            assert N==self.gpuCache['mu_gpu'].shape[0]
+            assert M==self.gpuCache['Z_gpu'].shape[0]
+            assert Q==self.gpuCache['l_gpu'].shape[0]
 
     def sync_params(self, lengthscale, Z, mu, S):
         if len(lengthscale)==1:
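These asserts guard the buffer reuse: the GPU cache is allocated once for a given (N, M, Q), and later calls must present the same problem size. A host-side sketch of the same pattern (class and dict names are illustrative):

    import numpy as np

    class PsiCompCacheSketch(object):
        def __init__(self):
            self.cache = None

        def init_cache(self, N, M, Q):
            if self.cache is None:
                self.cache = {
                    'Z':  np.empty((M, Q), order='F'),
                    'mu': np.empty((N, Q), order='F'),
                    'l':  np.empty((Q,), order='F'),
                }
            else:
                assert N == self.cache['mu'].shape[0]
                assert M == self.cache['Z'].shape[0]
                assert Q == self.cache['l'].shape[0]

    c = PsiCompCacheSketch()
    c.init_cache(5, 3, 2)
    c.init_cache(5, 3, 2)   # same sizes: buffers are reused, asserts pass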
@@ -299,7 +302,6 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
         self._initGPUCache(N,M,Q)
         self.sync_params(lengthscale, Z, variational_posterior.mean, variational_posterior.variance)
 
-        psi0_gpu = self.gpuCache['psi0_gpu']
         psi1_gpu = self.gpuCache['psi1_gpu']
         psi2_gpu = self.gpuCache['psi2_gpu']
         psi2n_gpu = self.gpuCache['psi2n_gpu']
@@ -308,14 +310,15 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
         mu_gpu = self.gpuCache['mu_gpu']
         S_gpu = self.gpuCache['S_gpu']
 
-        psi0_gpu.fill(variance)
+        psi0 = np.empty((N,))
+        psi0[:] = variance
         self.g_psi1computations(psi1_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1))
         self.g_psi2computations(psi2_gpu, psi2n_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1))
 
         if self.GPU_direct:
-            return psi0_gpu, psi1_gpu, psi2_gpu
+            return psi0, psi1_gpu, psi2_gpu
         else:
-            return psi0_gpu.get(), psi1_gpu.get(), psi2_gpu.get()
+            return psi0, psi1_gpu.get(), psi2_gpu.get()
 
     @Cache_this(limit=1, ignore_args=(0,1,2,3))
     def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
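The psi0 change works because, for the RBF kernel, psi0_n = E_q[k(x_n, x_n)] equals the kernel variance for every n: it is a constant vector, so there is nothing worth computing (or storing) on the GPU. A one-line check of the replacement:

    import numpy as np

    def rbf_psi0(variance, N):
        psi0 = np.empty((N,))
        psi0[:] = variance
        return psi0

    assert np.allclose(rbf_psi0(1.7, 4), np.full(4, 1.7))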
@@ -340,17 +343,19 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
         if self.GPU_direct:
             dL_dpsi1_gpu = dL_dpsi1
             dL_dpsi2_gpu = dL_dpsi2
+            dL_dpsi0_sum = gpuarray.sum(dL_dpsi0).get()
         else:
             dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu']
             dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu']
             dL_dpsi1_gpu.set(np.asfortranarray(dL_dpsi1))
             dL_dpsi2_gpu.set(np.asfortranarray(dL_dpsi2))
+            dL_dpsi0_sum = dL_dpsi0.sum()
 
         self.reset_derivative()
         self.g_psi1compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi1_gpu,psi1_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1))
         self.g_psi2compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi2_gpu,psi2n_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1))
 
-        dL_dvar = np.sum(dL_dpsi0) + gpuarray.sum(dvar_gpu).get()
+        dL_dvar = dL_dpsi0_sum + gpuarray.sum(dvar_gpu).get()
         sum_axis(grad_mu_gpu,dmu_gpu,N*Q,self.blocknum)
         dL_dmu = grad_mu_gpu.get()
         sum_axis(grad_S_gpu,dS_gpu,N*Q,self.blocknum)
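dL_dpsi0_sum feeds dL_dvar because psi0 depends on the variance with unit slope: psi0[n] = variance, so d psi0[n]/d variance = 1 and the chain rule contributes the plain sum of dL_dpsi0. A quick check of that step:

    import numpy as np

    dL_dpsi0 = np.random.randn(8)
    # chain rule: sum_n dL_dpsi0[n] * d psi0[n]/d variance, with the derivative = 1
    assert np.allclose(dL_dpsi0.sum(), np.dot(dL_dpsi0, np.ones(8)))

With GPU_direct on, dL_dpsi0 arrives as a gpuarray and the sum is taken on the device (gpuarray.sum(...).get()); otherwise it is a NumPy array and .sum() suffices.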
@@ -62,6 +62,9 @@ class BayesianGPLVM(SparseGP):
         else:
             from ..inference.latent_function_inference.var_dtc import VarDTC
             inference_method = VarDTC()
 
+        if kernel.useGPU and isinstance(inference_method, VarDTC_GPU):
+            kernel.psicomp.GPU_direct = True
+
         SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, name, **kwargs)
         self.add_parameter(self.X, index=0)
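The new branch only switches on direct GPU exchange when both ends of the psi-statistics pipeline live on the device: a GPU kernel paired with a CPU inference method (or vice versa) keeps the default host round-trip. A dependency-free sketch of the wiring (all stub classes are hypothetical):

    class PsiCompStub(object):
        GPU_direct = False

    class KernelStub(object):
        def __init__(self, useGPU):
            self.useGPU = useGPU
            self.psicomp = PsiCompStub()

    class VarDTC_GPU_Stub(object):
        pass

    def wire_gpu_direct(kernel, inference_method):
        # mirror of the new branch above
        if kernel.useGPU and isinstance(inference_method, VarDTC_GPU_Stub):
            kernel.psicomp.GPU_direct = True

    k = KernelStub(useGPU=True)
    wire_gpu_direct(k, VarDTC_GPU_Stub())
    assert k.psicomp.GPU_direct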
@@ -12,6 +12,9 @@ from ..util import gpu_init
 try:
     from pycuda.reduction import ReductionKernel
     from pycuda.elementwise import ElementwiseKernel
+    import scikits.cuda.linalg as culinalg
+    from scikits.cuda import cublas
+    from scikits.cuda.cula import culaExceptions
 
     # log|A| for A is a low triangle matrix
     # logDiagSum(A, A.shape[0]+1)
@@ -60,3 +63,27 @@ try:
 except:
     pass
 
+
+def jitchol(A, L, cublas_handle, maxtries=5):
+    try:
+        cublas.cublasDcopy(cublas_handle, A.size, A.gpudata, 1, L.gpudata, 1)
+        culinalg.cho_factor(L,'L')
+    except culaExceptions:
+        diagA = np.diag(A)
+        if np.any(diagA <= 0.):
+            raise linalg.LinAlgError, "not pd: non-positive diagonal elements"
+        jitter = diagA.mean() * 1e-6
+        while maxtries > 0 and np.isfinite(jitter):
+            print 'Warning: adding jitter of {:.10e}'.format(jitter)
+            try:
+                return linalg.cholesky(A + np.eye(A.shape[0]).T * jitter, lower=True)
+            except:
+                jitter *= 10
+            finally:
+                maxtries -= 1
+        raise linalg.LinAlgError, "not positive definite, even with jitter."
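The new GPU jitchol mirrors GPy's CPU jitchol: attempt the factorization as-is, and on failure retry with a growing multiple of the mean diagonal added to A. A host-only sketch of the same retry pattern, with the GPU copy and CULA factorization replaced by scipy.linalg.cholesky:

    import numpy as np
    from scipy import linalg

    def jitchol_cpu(A, maxtries=5):
        try:
            return linalg.cholesky(A, lower=True)
        except linalg.LinAlgError:
            diagA = np.diag(A)
            if np.any(diagA <= 0.):
                raise linalg.LinAlgError("not pd: non-positive diagonal elements")
            jitter = diagA.mean() * 1e-6
            while maxtries > 0 and np.isfinite(jitter):
                try:
                    return linalg.cholesky(A + np.eye(A.shape[0]) * jitter, lower=True)
                except linalg.LinAlgError:
                    jitter *= 10   # grow the jitter and retry
                finally:
                    maxtries -= 1
            raise linalg.LinAlgError("not positive definite, even with jitter.")

    A = np.ones((3, 3)) + 1e-10 * np.eye(3)   # barely positive definite
    Lc = jitchol_cpu(A)
    assert np.allclose(Lc.dot(Lc.T), A, atol=1e-4)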