From bc59cb8b225597df9e2d23294498e92e9768dbaf Mon Sep 17 00:00:00 2001 From: Zhenwen Dai Date: Wed, 26 Mar 2014 17:09:01 +0000 Subject: [PATCH] [GPU] psi1 after debug --- .../latent_function_inference/var_dtc_gpu.py | 28 ++-- GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py | 152 +++++++++++++++--- GPy/kern/_src/rbf.py | 2 +- GPy/models/ss_gplvm.py | 11 ++ GPy/util/linalg_gpu.py | 4 +- 5 files changed, 157 insertions(+), 40 deletions(-) diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py index ba7ec602..75a07992 100644 --- a/GPy/inference/latent_function_inference/var_dtc_gpu.py +++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py @@ -52,17 +52,17 @@ class VarDTC_GPU(object): def _initGPUCache(self, num_inducing, output_dim): if self.gpuCache == None: self.gpuCache = {# inference_likelihood - 'Kmm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64), - 'Lm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64), - 'ones_gpu' :gpuarray.empty(num_inducing, np.float64), - 'LL_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64), - 'b_gpu' :gpuarray.empty((num_inducing,output_dim),np.float64), - 'v_gpu' :gpuarray.empty((num_inducing,output_dim),np.float64), - 'vvt_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64), - 'KmmInvPsi2LLInvT_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64), - 'KmmInvPsi2P_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64), - 'dL_dpsi2R_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64), - 'dL_dKmm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64), + 'Kmm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), + 'Lm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), + 'ones_gpu' :gpuarray.empty(num_inducing, np.float64,order='F'), + 'LL_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), + 'b_gpu' :gpuarray.empty((num_inducing,output_dim),np.float64,order='F'), + 'v_gpu' :gpuarray.empty((num_inducing,output_dim),np.float64,order='F'), + 'vvt_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), + 'KmmInvPsi2LLInvT_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), + 'KmmInvPsi2P_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), + 'dL_dpsi2R_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), + 'dL_dKmm_gpu' :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'), # inference_minibatch } self.gpuCache['ones_gpu'].fill(1.0) @@ -134,11 +134,11 @@ class VarDTC_GPU(object): if het_noise: beta_slice = beta[n_start:n_end] psi0_full += (beta_slice*psi0).sum() - psi1Y_full += np.dot(psi1,beta_slice[:,None]*Y_slice) # DxM + psi1Y_full += np.dot(psi1.T,beta_slice[:,None]*Y_slice) # MxD YRY_full += (beta_slice*np.square(Y_slice).sum(axis=-1)).sum() else: psi0_full += psi0.sum() - psi1Y_full += np.dot(psi1,Y_slice) # DxM + psi1Y_full += np.dot(psi1.T,Y_slice) # MxD if uncertain_inputs: @@ -275,7 +275,7 @@ class VarDTC_GPU(object): # Compute the Posterior distribution of inducing points p(u|Y) #====================================================================== - post = Posterior(woodbury_inv=KmmInvPsi2P_gpu.get(), woodbury_vector=v_gpu.get(), K=Kmm_gpu.get(), mean=None, cov=None, K_chol=Lm.get()) + post = Posterior(woodbury_inv=KmmInvPsi2P_gpu.get(), woodbury_vector=v_gpu.get(), K=Kmm_gpu.get(), mean=None, cov=None, K_chol=Lm_gpu.get()) return logL, dL_dKmm, post diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py index 2acddae9..467b779d 100644 --- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py +++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py @@ -13,32 +13,118 @@ try: import pycuda.gpuarray as gpuarray from scikits.cuda import cublas import pycuda.autoinit - from pycuda.reduction import ReductionKernel - from ...util.linalg_gpu import logDiagSum - + from pycuda.reduction import ReductionKernel from pycuda.elementwise import ElementwiseKernel # The kernel form computing psi1 comp_psi1 = ElementwiseKernel( - "double *psi1, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q", - "psi1[i] = comp_psi1_element(var,l, Z, mu, S, logGamma, log1Gamma, psi1denom, N, M, Q, i)", + "double *psi1, double var, double l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q", + "psi1[i] = comp_psi1_element(var,l, Z, mu, S, logGamma, log1Gamma, logpsi1denom, N, M, Q, i)", "comp_psi1", preamble=""" - #define IDX_MQ(n,m,q) ((n*M+m)*Q+q) - #define IDX_Q(n,q) (n*Q+q) + #define IDX_NMQ(n,m,q) ((q*M+m)*N+n) + #define IDX_NQ(n,q) (q*N+n) + #define IDX_MQ(m,q) (q*M+m) + #define LOGEXPSUM(a,b) (a>=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) + + __device__ double comp_psi1_element(double var, double l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q, int idx) + { + int n = idx%N; + int m = idx/N; + double psi1_exp=0; + for(int q=0;q=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) __device__ double comp_psi1_element(double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q, int idx) { - int n = idx/M; - int m = idx%M; - double psi1=0; + int n = idx%N; + int m = idx/N; + double psi1_exp=0; for(int q=0;q=exp2?exp1+log(1.0+exp(exp2-exp1)):exp2+log(1.0+exp(exp1-exp2)); + double muZ = mu[IDX_NQ(n,q)]-Z[IDX_MQ(m,q)]; + double exp1 = logGamma[IDX_NQ(n,q)] - (logpsi1denom[IDX_NQ(n,q)] + muZ*muZ/(S[IDX_NQ(n,q)]+l[q]) )/2.0; + double exp2 = log1Gamma[IDX_NQ(n,q)] - Z[IDX_MQ(m,q)]*Z[IDX_MQ(m,q)]/(l[q]*2.0); + psi1_exp += LOGEXPSUM(exp1,exp2); } - return var*exp(psi1); + return var*exp(psi1_exp); + } + """) + + # The kernel form computing psi2 het_noise + comp_psi2_het = ElementwiseKernel( + "double *psi2, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q", + "psi2[i] = comp_psi1_element(var,l, Z, mu, S, logGamma, log1Gamma, logpsi2denom, N, M, Q, i)", + "comp_psi2", + preamble=""" + #define IDX_NMQ(n,m,q) ((q*M+m)*N+n) + #define IDX_NQ(n,q) (q*N+n) + #define IDX_MQ(m,q) (q*M+m) + #define LOGEXPSUM(a,b) (a>=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) + + __device__ double comp_psi1_element(double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q, int idx) + { + // psi2 (n,m1,m2) + int m2 = idx/(M*N); + int m1 = (idx%(M*N))/N; + int n = idx%N; + + double psi2_exp=0; + for(int q=0;q=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b))) + + __device__ double comp_psi1_element(double var, double l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q, int idx) + { + // psi2 (n,m1,m2) + int m2 = idx/(M*N); + int m1 = (idx%(M*N))/N; + int n = idx%N; + + double psi2_exp=0; + for(int q=0;q