mirror of
https://github.com/SheffieldML/GPy.git
synced 2026-06-05 14:55:15 +02:00
rbf gpu psicomp pass gradcheck
This commit is contained in:
parent
db9b9bc857
commit
e0412ebf54
1 changed files with 134 additions and 32 deletions
|
|
@ -17,6 +17,7 @@ gpu_code = """
|
|||
// define THREADNUM
|
||||
|
||||
#define IDX_NMQ(n,m,q) ((q*M+m)*N+n)
|
||||
#define IDX_NMM(n,m1,m2) ((m2*M+m1)*N+n)
|
||||
#define IDX_NQ(n,q) (q*N+n)
|
||||
#define IDX_NM(n,m) (m*N+n)
|
||||
#define IDX_MQ(m,q) (q*M+m)
|
||||
|
|
@ -66,15 +67,17 @@ gpu_code = """
|
|||
double log_psi1 = 0;
|
||||
for(int q=0;q<Q;q++) {
|
||||
double muZ = mu[IDX_NQ(n,q)]-Z[IDX_MQ(m,q)];
|
||||
log_psi1 += (muZ*muZ/(S[IDX_NQ(n,q)]+l[q]))/(-2.);
|
||||
log_psi1 += log(S[IDX_NQ(n,q)]/l[q]+1)/(-2.);
|
||||
double Snq = S[IDX_NQ(n,q)];
|
||||
double lq = l[q]*l[q];
|
||||
log_psi1 += (muZ*muZ/(Snq+lq))/(-2.);
|
||||
log_psi1 += log(Snq/lq+1)/(-2.);
|
||||
}
|
||||
psi1[IDX_NM(n,m)] = var*exp(log_psi1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void psi2computations(double *psi2, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
|
||||
__global__ void psi2computations(double *psi2, double *psi2n, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
|
||||
{
|
||||
int psi2_idx_start, psi2_idx_end;
|
||||
__shared__ double psi2_local[THREADNUM];
|
||||
|
|
@ -90,10 +93,15 @@ gpu_code = """
|
|||
for(int q=0;q<Q;q++) {
|
||||
double dZ = Z[IDX_MQ(m1,q)] - Z[IDX_MQ(m2,q)];
|
||||
double muZhat = mu[IDX_NQ(n,q)]- (Z[IDX_MQ(m1,q)]+Z[IDX_MQ(m2,q)])/2.;
|
||||
log_psi2_n += dZ*dZ/(-4.*l[q])-muZhat*muZhat/(2.*S[IDX_NQ(n,q)]+l[q]);
|
||||
log_psi2_n += log(2.*S[IDX_NQ(n,q)]/l[q]+1)/(-2.);
|
||||
double Snq = S[IDX_NQ(n,q)];
|
||||
double lq = l[q]*l[q];
|
||||
log_psi2_n += dZ*dZ/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq);
|
||||
log_psi2_n += log(2.*Snq/lq+1)/(-2.);
|
||||
}
|
||||
psi2_local[threadIdx.x] += exp(log_psi2_n);
|
||||
double exp_psi2_n = exp(log_psi2_n);
|
||||
psi2n[IDX_NMM(n,m1,m2)] = var*var*exp_psi2_n;
|
||||
if(m1!=m2) { psi2n[IDX_NMM(n,m2,m1)] = var*var*exp_psi2_n;}
|
||||
psi2_local[threadIdx.x] += exp_psi2_n;
|
||||
}
|
||||
__syncthreads();
|
||||
reduce_sum(psi2_local, THREADNUM);
|
||||
|
|
@ -114,25 +122,27 @@ gpu_code = """
|
|||
|
||||
double dvar_local = 0;
|
||||
for(int q=0;q<Q;q++) {
|
||||
double lq_sqrt = l[q];
|
||||
double lq = lq_sqrt*lq_sqrt;
|
||||
double dl_local = 0;
|
||||
for(int p=0;p<P;p++) {
|
||||
int n = p*THREADNUM + threadIdx.x;
|
||||
double dmu_local = 0;
|
||||
double dS_local = 0;
|
||||
double Snq,mu_nq;
|
||||
if(n<N) {Snq = S[IDX_NQ(n,q)]; mu_nq=mu[IDX_NQ(n,q)];}
|
||||
for(int m=m_start; m<m_end; m++) {
|
||||
if(n<N) {
|
||||
double lpsi1 = psi1[IDX_NM(n,m)]*dL_dpsi1[IDX_NM(n,m)];
|
||||
if(q==0) {dvar_local += lpsi1;}
|
||||
|
||||
double Snq = S[IDX_NQ(n,q)];
|
||||
double lq = l[q];
|
||||
double Zmu = Z[IDX_MQ(m,q)] - mu[IDX_NQ(n,q)];
|
||||
double Zmu = Z[IDX_MQ(m,q)] - mu_nq;
|
||||
double denom = Snq+lq;
|
||||
double Zmu2_denom = Zmu*Zmu/denom;
|
||||
|
||||
dmu_local += lpsi1*Zmu/denom;
|
||||
dS_local += lpsi1*(Zmu2_denom-1.)/denom;
|
||||
dl_local += lpsi1*(Zmu2_denom+Snq/lq)/denom*sqrt(lq);
|
||||
dl_local += lpsi1*(Zmu2_denom+Snq/lq)/denom;
|
||||
g_local[threadIdx.x] = -lpsi1*Zmu/denom;
|
||||
}
|
||||
__syncthreads();
|
||||
|
|
@ -145,7 +155,7 @@ gpu_code = """
|
|||
}
|
||||
__threadfence_block();
|
||||
}
|
||||
g_local[threadIdx.x] = dl_local;
|
||||
g_local[threadIdx.x] = dl_local*lq_sqrt;
|
||||
__syncthreads();
|
||||
reduce_sum(g_local, THREADNUM);
|
||||
if(threadIdx.x==0) {dl[IDX_QB(q,blockIdx.x)] += g_local[0];}
|
||||
|
|
@ -155,21 +165,80 @@ gpu_code = """
|
|||
reduce_sum(g_local, THREADNUM);
|
||||
if(threadIdx.x==0) {dvar[blockIdx.x] += g_local[0]/var;}
|
||||
}
|
||||
|
||||
__global__ void psi2compDer(double *dvar, double *dl, double *dZ, double *dmu, double *dS, double *dL_dpsi2, double *psi2n, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
|
||||
{
|
||||
int m_start, m_end;
|
||||
__shared__ double g_local[THREADNUM];
|
||||
divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
|
||||
int P = int(ceil(double(N)/THREADNUM));
|
||||
|
||||
double dvar_local = 0;
|
||||
for(int q=0;q<Q;q++) {
|
||||
double lq_sqrt = l[q];
|
||||
double lq = lq_sqrt*lq_sqrt;
|
||||
double dl_local = 0;
|
||||
for(int p=0;p<P;p++) {
|
||||
int n = p*THREADNUM + threadIdx.x;
|
||||
double dmu_local = 0;
|
||||
double dS_local = 0;
|
||||
double Snq,mu_nq;
|
||||
if(n<N) {Snq = S[IDX_NQ(n,q)]; mu_nq=mu[IDX_NQ(n,q)];}
|
||||
for(int m1=m_start; m1<m_end; m1++) {
|
||||
g_local[threadIdx.x] = 0;
|
||||
for(int m2=0;m2<M;m2++) {
|
||||
if(n<N) {
|
||||
double lpsi2 = psi2n[IDX_NMM(n,m1,m2)]*dL_dpsi2[IDX_MM(m1,m2)];
|
||||
if(q==0) {dvar_local += lpsi2;}
|
||||
|
||||
double dZ = Z[IDX_MQ(m1,q)] - Z[IDX_MQ(m2,q)];
|
||||
double muZhat = mu_nq - (Z[IDX_MQ(m1,q)] + Z[IDX_MQ(m2,q)])/2.;
|
||||
double denom = 2.*Snq+lq;
|
||||
double muZhat2_denom = muZhat*muZhat/denom;
|
||||
|
||||
dmu_local += lpsi2*muZhat/denom;
|
||||
dS_local += lpsi2*(2.*muZhat2_denom-1.)/denom;
|
||||
dl_local += lpsi2*((Snq/lq+muZhat2_denom)/denom+dZ*dZ/(4.*lq*lq));
|
||||
g_local[threadIdx.x] += 2.*lpsi2*(muZhat/denom-dZ/(2*lq));
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
reduce_sum(g_local, p<P-1?THREADNUM:N-(P-1)*THREADNUM);
|
||||
if(threadIdx.x==0) {dZ[IDX_MQ(m1,q)] += g_local[0];}
|
||||
}
|
||||
if(n<N) {
|
||||
dmu[IDX_NQB(n,q,blockIdx.x)] += -2.*dmu_local;
|
||||
dS[IDX_NQB(n,q,blockIdx.x)] += dS_local;
|
||||
}
|
||||
__threadfence_block();
|
||||
}
|
||||
g_local[threadIdx.x] = dl_local*2.*lq_sqrt;
|
||||
__syncthreads();
|
||||
reduce_sum(g_local, THREADNUM);
|
||||
if(threadIdx.x==0) {dl[IDX_QB(q,blockIdx.x)] += g_local[0];}
|
||||
}
|
||||
g_local[threadIdx.x] = dvar_local;
|
||||
__syncthreads();
|
||||
reduce_sum(g_local, THREADNUM);
|
||||
if(threadIdx.x==0) {dvar[blockIdx.x] += g_local[0]*2/var;}
|
||||
}
|
||||
"""
|
||||
|
||||
class PSICOMP_RBF_GPU(PSICOMP_RBF):
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, GPU_direct=False):
|
||||
assert gpu_init.initSuccess, "GPU initialization failed!"
|
||||
self.GPU_direct = GPU_direct
|
||||
self.cublas_handle = gpu_init.cublas_handle
|
||||
self.gpuCache = None
|
||||
|
||||
self.threadnum = 128
|
||||
self.blocknum = 3
|
||||
self.blocknum = 15
|
||||
module = SourceModule("#define THREADNUM "+str(self.threadnum)+"\n"+gpu_code)
|
||||
self.g_psi1computations = module.get_function('psi1computations')
|
||||
self.g_psi2computations = module.get_function('psi2computations')
|
||||
self.g_psi1compDer = module.get_function('psi1compDer')
|
||||
self.g_psi2compDer = module.get_function('psi2compDer')
|
||||
|
||||
def _initGPUCache(self, N, M, Q):
|
||||
if self.gpuCache == None:
|
||||
|
|
@ -181,6 +250,7 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
|
|||
'psi0_gpu' :gpuarray.empty((N,),np.float64,order='F'),
|
||||
'psi1_gpu' :gpuarray.empty((N,M),np.float64,order='F'),
|
||||
'psi2_gpu' :gpuarray.empty((M,M),np.float64,order='F'),
|
||||
'psi2n_gpu' :gpuarray.empty((N,M,M),np.float64,order='F'),
|
||||
# derivatives
|
||||
'dvar_gpu' :gpuarray.empty((self.blocknum,),np.float64, order='F'),
|
||||
'dl_gpu' :gpuarray.empty((Q,self.blocknum),np.float64, order='F'),
|
||||
|
|
@ -227,6 +297,7 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
|
|||
psi0_gpu = self.gpuCache['psi0_gpu']
|
||||
psi1_gpu = self.gpuCache['psi1_gpu']
|
||||
psi2_gpu = self.gpuCache['psi2_gpu']
|
||||
psi2n_gpu = self.gpuCache['psi2n_gpu']
|
||||
l_gpu = self.gpuCache['l_gpu']
|
||||
Z_gpu = self.gpuCache['Z_gpu']
|
||||
mu_gpu = self.gpuCache['mu_gpu']
|
||||
|
|
@ -234,7 +305,11 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
|
|||
|
||||
psi0_gpu.fill(variance)
|
||||
self.g_psi1computations(psi1_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1))
|
||||
self.g_psi2computations(psi2_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1))
|
||||
self.g_psi2computations(psi2_gpu, psi2n_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1))
|
||||
|
||||
if self.GPU_direct:
|
||||
return psi0_gpu, psi1_gpu, psi2_gpu
|
||||
else:
|
||||
return psi0_gpu.get(), psi1_gpu.get(), psi2_gpu.get()
|
||||
|
||||
psi0 = np.empty(mu.shape[0])
|
||||
|
|
@ -246,7 +321,7 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
|
|||
print np.abs(psi1-psi1g).max()
|
||||
|
||||
psi2 = _psi2computations(variance, lengthscale, Z, mu, S).sum(axis=0)
|
||||
self.g_psi2computations(psi2_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1))
|
||||
self.g_psi2computations(psi2_gpu, psi2n_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1))
|
||||
psi2g = psi2_gpu.get()
|
||||
print np.abs(psi2-psi2g).max()
|
||||
|
||||
|
|
@ -264,9 +339,8 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
|
|||
N = mu.shape[0]
|
||||
M = Z.shape[0]
|
||||
Q = Z.shape[1]
|
||||
psi0_gpu = self.gpuCache['psi0_gpu']
|
||||
psi1_gpu = self.gpuCache['psi1_gpu']
|
||||
psi2_gpu = self.gpuCache['psi2_gpu']
|
||||
psi2n_gpu = self.gpuCache['psi2n_gpu']
|
||||
l_gpu = self.gpuCache['l_gpu']
|
||||
Z_gpu = self.gpuCache['Z_gpu']
|
||||
mu_gpu = self.gpuCache['mu_gpu']
|
||||
|
|
@ -277,22 +351,50 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
|
|||
dmu_gpu = self.gpuCache['dmu_gpu']
|
||||
dS_gpu = self.gpuCache['dS_gpu']
|
||||
|
||||
if self.GPU_direct:
|
||||
dL_dpsi1_gpu = dL_dpsi1
|
||||
dL_dpsi2_gpu = dL_dpsi2
|
||||
else:
|
||||
dL_dpsi1_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi1))
|
||||
dL_dpsi2_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi2))
|
||||
|
||||
self.reset_derivative()
|
||||
self.g_psi1compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi1_gpu,psi1_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1))
|
||||
dZg = dZ_gpu.get()
|
||||
print dZ_psi1, dZg
|
||||
# print np.abs(dvar_psi1-dvarg).max()
|
||||
# print np.abs(dvar_psi1-dvar_gpu.get().sum(axis=-1)).max()
|
||||
# print np.abs(dl_psi1-dl_gpu.get().sum(axis=-1)).max()
|
||||
# print np.abs(dmu_psi1-dmu_gpu.get().sum(axis=-1)).max()
|
||||
# print np.abs(dS_psi1-dS_gpu.get().sum(axis=-1)).max()
|
||||
# print np.abs(dZ_psi1-dZ_gpu.get()).max()
|
||||
|
||||
dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2
|
||||
# self.reset_derivative()
|
||||
self.g_psi2compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi2_gpu,psi2n_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1))
|
||||
# print np.abs(dvar_psi2-dvar_gpu.get().sum(axis=-1)).max()
|
||||
# print np.abs(dl_psi2-dl_gpu.get().sum(axis=-1)).max()
|
||||
# print np.abs(dmu_psi2-dmu_gpu.get().sum(axis=-1)).max()
|
||||
# print np.abs(dS_psi2-dS_gpu.get().sum(axis=-1)).max()
|
||||
# print np.abs(dZ_psi2-dZ_gpu.get()).max()
|
||||
|
||||
dL_dlengscale = dl_psi1 + dl_psi2
|
||||
if not ARD:
|
||||
dL_dlengscale = dL_dlengscale.sum()
|
||||
dL_dvar = np.sum(dL_dpsi0) + dvar_gpu.get().sum()
|
||||
dL_dmu = dmu_gpu.get().sum(axis=-1)
|
||||
dL_dS = dS_gpu.get().sum(axis=-1)
|
||||
dL_dZ = dZ_gpu.get()
|
||||
if ARD:
|
||||
dL_dlengscale = dl_gpu.get().sum(axis=-1)
|
||||
else:
|
||||
dL_dlengscale = dl_gpu.get().sum()
|
||||
|
||||
dL_dmu = dmu_psi1 + dmu_psi2
|
||||
dL_dS = dS_psi1 + dS_psi2
|
||||
dL_dZ = dZ_psi1 + dZ_psi2
|
||||
# print np.abs(dL_dlengscale - dl_psi1-dl_psi2).max()
|
||||
|
||||
#
|
||||
# dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2
|
||||
#
|
||||
# dL_dlengscale = dl_psi1 + dl_psi2
|
||||
# if not ARD:
|
||||
# dL_dlengscale = dL_dlengscale.sum()
|
||||
#
|
||||
# dL_dmu = dmu_psi1 + dmu_psi2
|
||||
# dL_dS = dS_psi1 + dS_psi2
|
||||
# dL_dZ = dZ_psi1 + dZ_psi2
|
||||
|
||||
return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue