rbf gpu psicomp pass gradcheck

This commit is contained in:
Zhenwen Dai 2014-06-20 14:09:26 +01:00
parent db9b9bc857
commit e0412ebf54

View file

@ -17,6 +17,7 @@ gpu_code = """
// define THREADNUM
#define IDX_NMQ(n,m,q) ((q*M+m)*N+n)
#define IDX_NMM(n,m1,m2) ((m2*M+m1)*N+n)
#define IDX_NQ(n,q) (q*N+n)
#define IDX_NM(n,m) (m*N+n)
#define IDX_MQ(m,q) (q*M+m)
@ -66,15 +67,17 @@ gpu_code = """
double log_psi1 = 0;
for(int q=0;q<Q;q++) {
double muZ = mu[IDX_NQ(n,q)]-Z[IDX_MQ(m,q)];
log_psi1 += (muZ*muZ/(S[IDX_NQ(n,q)]+l[q]))/(-2.);
log_psi1 += log(S[IDX_NQ(n,q)]/l[q]+1)/(-2.);
double Snq = S[IDX_NQ(n,q)];
double lq = l[q]*l[q];
log_psi1 += (muZ*muZ/(Snq+lq))/(-2.);
log_psi1 += log(Snq/lq+1)/(-2.);
}
psi1[IDX_NM(n,m)] = var*exp(log_psi1);
}
}
}
__global__ void psi2computations(double *psi2, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
__global__ void psi2computations(double *psi2, double *psi2n, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
{
int psi2_idx_start, psi2_idx_end;
__shared__ double psi2_local[THREADNUM];
@ -90,10 +93,15 @@ gpu_code = """
for(int q=0;q<Q;q++) {
double dZ = Z[IDX_MQ(m1,q)] - Z[IDX_MQ(m2,q)];
double muZhat = mu[IDX_NQ(n,q)]- (Z[IDX_MQ(m1,q)]+Z[IDX_MQ(m2,q)])/2.;
log_psi2_n += dZ*dZ/(-4.*l[q])-muZhat*muZhat/(2.*S[IDX_NQ(n,q)]+l[q]);
log_psi2_n += log(2.*S[IDX_NQ(n,q)]/l[q]+1)/(-2.);
double Snq = S[IDX_NQ(n,q)];
double lq = l[q]*l[q];
log_psi2_n += dZ*dZ/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq);
log_psi2_n += log(2.*Snq/lq+1)/(-2.);
}
psi2_local[threadIdx.x] += exp(log_psi2_n);
double exp_psi2_n = exp(log_psi2_n);
psi2n[IDX_NMM(n,m1,m2)] = var*var*exp_psi2_n;
if(m1!=m2) { psi2n[IDX_NMM(n,m2,m1)] = var*var*exp_psi2_n;}
psi2_local[threadIdx.x] += exp_psi2_n;
}
__syncthreads();
reduce_sum(psi2_local, THREADNUM);
@ -114,25 +122,27 @@ gpu_code = """
double dvar_local = 0;
for(int q=0;q<Q;q++) {
double lq_sqrt = l[q];
double lq = lq_sqrt*lq_sqrt;
double dl_local = 0;
for(int p=0;p<P;p++) {
int n = p*THREADNUM + threadIdx.x;
double dmu_local = 0;
double dS_local = 0;
double Snq,mu_nq;
if(n<N) {Snq = S[IDX_NQ(n,q)]; mu_nq=mu[IDX_NQ(n,q)];}
for(int m=m_start; m<m_end; m++) {
if(n<N) {
double lpsi1 = psi1[IDX_NM(n,m)]*dL_dpsi1[IDX_NM(n,m)];
if(q==0) {dvar_local += lpsi1;}
double Snq = S[IDX_NQ(n,q)];
double lq = l[q];
double Zmu = Z[IDX_MQ(m,q)] - mu[IDX_NQ(n,q)];
double Zmu = Z[IDX_MQ(m,q)] - mu_nq;
double denom = Snq+lq;
double Zmu2_denom = Zmu*Zmu/denom;
dmu_local += lpsi1*Zmu/denom;
dS_local += lpsi1*(Zmu2_denom-1.)/denom;
dl_local += lpsi1*(Zmu2_denom+Snq/lq)/denom*sqrt(lq);
dl_local += lpsi1*(Zmu2_denom+Snq/lq)/denom;
g_local[threadIdx.x] = -lpsi1*Zmu/denom;
}
__syncthreads();
@ -145,7 +155,7 @@ gpu_code = """
}
__threadfence_block();
}
g_local[threadIdx.x] = dl_local;
g_local[threadIdx.x] = dl_local*lq_sqrt;
__syncthreads();
reduce_sum(g_local, THREADNUM);
if(threadIdx.x==0) {dl[IDX_QB(q,blockIdx.x)] += g_local[0];}
@ -155,21 +165,80 @@ gpu_code = """
reduce_sum(g_local, THREADNUM);
if(threadIdx.x==0) {dvar[blockIdx.x] += g_local[0]/var;}
}
__global__ void psi2compDer(double *dvar, double *dl, double *dZ, double *dmu, double *dS, double *dL_dpsi2, double *psi2n, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
{
int m_start, m_end;
__shared__ double g_local[THREADNUM];
divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
int P = int(ceil(double(N)/THREADNUM));
double dvar_local = 0;
for(int q=0;q<Q;q++) {
double lq_sqrt = l[q];
double lq = lq_sqrt*lq_sqrt;
double dl_local = 0;
for(int p=0;p<P;p++) {
int n = p*THREADNUM + threadIdx.x;
double dmu_local = 0;
double dS_local = 0;
double Snq,mu_nq;
if(n<N) {Snq = S[IDX_NQ(n,q)]; mu_nq=mu[IDX_NQ(n,q)];}
for(int m1=m_start; m1<m_end; m1++) {
g_local[threadIdx.x] = 0;
for(int m2=0;m2<M;m2++) {
if(n<N) {
double lpsi2 = psi2n[IDX_NMM(n,m1,m2)]*dL_dpsi2[IDX_MM(m1,m2)];
if(q==0) {dvar_local += lpsi2;}
double dZ = Z[IDX_MQ(m1,q)] - Z[IDX_MQ(m2,q)];
double muZhat = mu_nq - (Z[IDX_MQ(m1,q)] + Z[IDX_MQ(m2,q)])/2.;
double denom = 2.*Snq+lq;
double muZhat2_denom = muZhat*muZhat/denom;
dmu_local += lpsi2*muZhat/denom;
dS_local += lpsi2*(2.*muZhat2_denom-1.)/denom;
dl_local += lpsi2*((Snq/lq+muZhat2_denom)/denom+dZ*dZ/(4.*lq*lq));
g_local[threadIdx.x] += 2.*lpsi2*(muZhat/denom-dZ/(2*lq));
}
}
__syncthreads();
reduce_sum(g_local, p<P-1?THREADNUM:N-(P-1)*THREADNUM);
if(threadIdx.x==0) {dZ[IDX_MQ(m1,q)] += g_local[0];}
}
if(n<N) {
dmu[IDX_NQB(n,q,blockIdx.x)] += -2.*dmu_local;
dS[IDX_NQB(n,q,blockIdx.x)] += dS_local;
}
__threadfence_block();
}
g_local[threadIdx.x] = dl_local*2.*lq_sqrt;
__syncthreads();
reduce_sum(g_local, THREADNUM);
if(threadIdx.x==0) {dl[IDX_QB(q,blockIdx.x)] += g_local[0];}
}
g_local[threadIdx.x] = dvar_local;
__syncthreads();
reduce_sum(g_local, THREADNUM);
if(threadIdx.x==0) {dvar[blockIdx.x] += g_local[0]*2/var;}
}
"""
class PSICOMP_RBF_GPU(PSICOMP_RBF):
def __init__(self):
def __init__(self, GPU_direct=False):
assert gpu_init.initSuccess, "GPU initialization failed!"
self.GPU_direct = GPU_direct
self.cublas_handle = gpu_init.cublas_handle
self.gpuCache = None
self.threadnum = 128
self.blocknum = 3
self.blocknum = 15
module = SourceModule("#define THREADNUM "+str(self.threadnum)+"\n"+gpu_code)
self.g_psi1computations = module.get_function('psi1computations')
self.g_psi2computations = module.get_function('psi2computations')
self.g_psi1compDer = module.get_function('psi1compDer')
self.g_psi2compDer = module.get_function('psi2compDer')
def _initGPUCache(self, N, M, Q):
if self.gpuCache == None:
@ -181,6 +250,7 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
'psi0_gpu' :gpuarray.empty((N,),np.float64,order='F'),
'psi1_gpu' :gpuarray.empty((N,M),np.float64,order='F'),
'psi2_gpu' :gpuarray.empty((M,M),np.float64,order='F'),
'psi2n_gpu' :gpuarray.empty((N,M,M),np.float64,order='F'),
# derivatives
'dvar_gpu' :gpuarray.empty((self.blocknum,),np.float64, order='F'),
'dl_gpu' :gpuarray.empty((Q,self.blocknum),np.float64, order='F'),
@ -227,6 +297,7 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
psi0_gpu = self.gpuCache['psi0_gpu']
psi1_gpu = self.gpuCache['psi1_gpu']
psi2_gpu = self.gpuCache['psi2_gpu']
psi2n_gpu = self.gpuCache['psi2n_gpu']
l_gpu = self.gpuCache['l_gpu']
Z_gpu = self.gpuCache['Z_gpu']
mu_gpu = self.gpuCache['mu_gpu']
@ -234,8 +305,12 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
psi0_gpu.fill(variance)
self.g_psi1computations(psi1_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1))
self.g_psi2computations(psi2_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1))
return psi0_gpu.get(), psi1_gpu.get(), psi2_gpu.get()
self.g_psi2computations(psi2_gpu, psi2n_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1))
if self.GPU_direct:
return psi0_gpu, psi1_gpu, psi2_gpu
else:
return psi0_gpu.get(), psi1_gpu.get(), psi2_gpu.get()
psi0 = np.empty(mu.shape[0])
psi0[:] = variance
@ -246,7 +321,7 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
print np.abs(psi1-psi1g).max()
psi2 = _psi2computations(variance, lengthscale, Z, mu, S).sum(axis=0)
self.g_psi2computations(psi2_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1))
self.g_psi2computations(psi2_gpu, psi2n_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1))
psi2g = psi2_gpu.get()
print np.abs(psi2-psi2g).max()
@ -264,9 +339,8 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
N = mu.shape[0]
M = Z.shape[0]
Q = Z.shape[1]
psi0_gpu = self.gpuCache['psi0_gpu']
psi1_gpu = self.gpuCache['psi1_gpu']
psi2_gpu = self.gpuCache['psi2_gpu']
psi2n_gpu = self.gpuCache['psi2n_gpu']
l_gpu = self.gpuCache['l_gpu']
Z_gpu = self.gpuCache['Z_gpu']
mu_gpu = self.gpuCache['mu_gpu']
@ -277,22 +351,50 @@ class PSICOMP_RBF_GPU(PSICOMP_RBF):
dmu_gpu = self.gpuCache['dmu_gpu']
dS_gpu = self.gpuCache['dS_gpu']
dL_dpsi1_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi1))
self.reset_derivative()
if self.GPU_direct:
dL_dpsi1_gpu = dL_dpsi1
dL_dpsi2_gpu = dL_dpsi2
else:
dL_dpsi1_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi1))
dL_dpsi2_gpu = gpuarray.to_gpu(np.asfortranarray(dL_dpsi2))
self.reset_derivative()
self.g_psi1compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi1_gpu,psi1_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1))
dZg = dZ_gpu.get()
print dZ_psi1, dZg
# print np.abs(dvar_psi1-dvarg).max()
dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2
# print np.abs(dvar_psi1-dvar_gpu.get().sum(axis=-1)).max()
# print np.abs(dl_psi1-dl_gpu.get().sum(axis=-1)).max()
# print np.abs(dmu_psi1-dmu_gpu.get().sum(axis=-1)).max()
# print np.abs(dS_psi1-dS_gpu.get().sum(axis=-1)).max()
# print np.abs(dZ_psi1-dZ_gpu.get()).max()
# self.reset_derivative()
self.g_psi2compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi2_gpu,psi2n_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1))
# print np.abs(dvar_psi2-dvar_gpu.get().sum(axis=-1)).max()
# print np.abs(dl_psi2-dl_gpu.get().sum(axis=-1)).max()
# print np.abs(dmu_psi2-dmu_gpu.get().sum(axis=-1)).max()
# print np.abs(dS_psi2-dS_gpu.get().sum(axis=-1)).max()
# print np.abs(dZ_psi2-dZ_gpu.get()).max()
dL_dvar = np.sum(dL_dpsi0) + dvar_gpu.get().sum()
dL_dmu = dmu_gpu.get().sum(axis=-1)
dL_dS = dS_gpu.get().sum(axis=-1)
dL_dZ = dZ_gpu.get()
if ARD:
dL_dlengscale = dl_gpu.get().sum(axis=-1)
else:
dL_dlengscale = dl_gpu.get().sum()
# print np.abs(dL_dlengscale - dl_psi1-dl_psi2).max()
dL_dlengscale = dl_psi1 + dl_psi2
if not ARD:
dL_dlengscale = dL_dlengscale.sum()
dL_dmu = dmu_psi1 + dmu_psi2
dL_dS = dS_psi1 + dS_psi2
dL_dZ = dZ_psi1 + dZ_psi2
#
# dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2
#
# dL_dlengscale = dl_psi1 + dl_psi2
# if not ARD:
# dL_dlengscale = dL_dlengscale.sum()
#
# dL_dmu = dmu_psi1 + dmu_psi2
# dL_dS = dS_psi1 + dS_psi2
# dL_dZ = dZ_psi1 + dZ_psi2
return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS