From 4465c5be8de893e277dcce34288a1720035c8103 Mon Sep 17 00:00:00 2001
From: Zhenwen Dai
Date: Thu, 27 Mar 2014 17:12:17 +0000
Subject: [PATCH] [GPU] psi computation

---
 GPy/kern/_src/kern.py                       |   5 +-
 GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py | 125 ++++++++++++++++++--
 GPy/kern/_src/rbf.py                        |  24 +++-
 GPy/models/ss_gplvm.py                      |   4 +-
 GPy/util/linalg_gpu.py                      |  14 +++
 5 files changed, 153 insertions(+), 19 deletions(-)

diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py
index 31fa8690..be8a15b2 100644
--- a/GPy/kern/_src/kern.py
+++ b/GPy/kern/_src/kern.py
@@ -16,7 +16,8 @@ class Kern(Parameterized):
     __metaclass__ = KernCallsViaSlicerMeta
     #===========================================================================
     _debug=False
-    def __init__(self, input_dim, active_dims, name, *a, **kw):
+    _support_GPU=False
+    def __init__(self, input_dim, active_dims, name, useGPU=False, *a, **kw):
         """
         The base class for a kernel: a positive definite function which forms of a covariance function (kernel).
@@ -40,6 +41,8 @@ class Kern(Parameterized):
         active_dim_size = len(self.active_dims)
         assert active_dim_size == self.input_dim, "input_dim={} does not match len(active_dim)={}, active_dims={}".format(self.input_dim, active_dim_size, self.active_dims)
         self._sliced_X = 0
+
+        self.useGPU = self._support_GPU and useGPU

     @Cache_this(limit=10)
     def _slice_X(self, X):
diff --git a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py
index 467b779d..071d8795 100644
--- a/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py
+++ b/GPy/kern/_src/psi_comp/ssrbf_psi_gpucomp.py
@@ -15,6 +15,7 @@ try:
     import pycuda.autoinit
     from pycuda.reduction import ReductionKernel
     from pycuda.elementwise import ElementwiseKernel
+    from ....util import linalg_gpu

     # The kernel form computing psi1
     comp_psi1 = ElementwiseKernel(
@@ -45,15 +46,15 @@ try:
     # The kernel form computing psi1 het_noise
     comp_psi1_het = ElementwiseKernel(
         "double *psi1, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q",
-        "psi1[i] = comp_psi1_element(var,l, Z, mu, S, logGamma, log1Gamma, logpsi1denom, N, M, Q, i)",
-        "comp_psi1",
+        "psi1[i] = comp_psi1_element_het(var,l, Z, mu, S, logGamma, log1Gamma, logpsi1denom, N, M, Q, i)",
+        "comp_psi1_het",
         preamble="""
         #define IDX_NMQ(n,m,q) ((q*M+m)*N+n)
         #define IDX_NQ(n,q) (q*N+n)
         #define IDX_MQ(m,q) (q*M+m)
         #define LOGEXPSUM(a,b) (a>=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b)))
-        __device__ double comp_psi1_element(double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q, int idx)
+        __device__ double comp_psi1_element_het(double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q, int idx)
         {
             int n = idx%N;
             int m = idx/N;
@@ -71,15 +72,15 @@ try:
     # The kernel form computing psi2 het_noise
     comp_psi2_het = ElementwiseKernel(
         "double *psi2, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q",
-        "psi2[i] = comp_psi1_element(var,l, Z, mu, S, logGamma, log1Gamma, logpsi2denom, N, M, Q, i)",
-        "comp_psi2",
+        "psi2[i] = comp_psi2_element_het(var,l, Z, mu, S, logGamma, log1Gamma, logpsi2denom, N, M, Q, i)",
+        "comp_psi2_het",
         preamble="""
         #define IDX_NMQ(n,m,q) ((q*M+m)*N+n)
         #define IDX_NQ(n,q) (q*N+n)
         #define IDX_MQ(m,q) (q*M+m)
         #define LOGEXPSUM(a,b) (a>=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b)))
-        __device__ double comp_psi1_element(double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q, int idx)
+        __device__ double comp_psi2_element_het(double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q, int idx)
         {
             // psi2 (n,m1,m2)
             int m2 = idx/(M*N);
@@ -90,7 +91,7 @@ try:
             for(int q=0;q<Q;q++) {
         #define LOGEXPSUM(a,b) (a>=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b)))
-        __device__ double comp_psi1_element(double var, double l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q, int idx)
+        __device__ double comp_psi2_element(double var, double l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q, int idx)
         {
             // psi2 (n,m1,m2)
             int m2 = idx/(M*N);
@@ -120,19 +121,117 @@ try:
             for(int q=0;q<Q;q++) {
+        if len(lengthscale)>1:
+            het_noise = True
+        else:
+            het_noise = False
+
+        N = mu.shape[0]
+        M = Z.shape[0]
+        Q = mu.shape[1]
+
+        self._initGPUCache(N,M,Q)
+        if het_noise:
+            l_gpu = self.gpuCache['l_gpu']
+            l_gpu.set(np.asfortranarray(lengthscale**2))
+        else:
+            lengthscale2 = lengthscale**2
+
+        Z_gpu = self.gpuCache['Z_gpu']
+        mu_gpu = self.gpuCache['mu_gpu']
+        S_gpu = self.gpuCache['S_gpu']
+        gamma_gpu = self.gpuCache['gamma_gpu']
+        logGamma_gpu = self.gpuCache['logGamma_gpu']
+        log1Gamma_gpu = self.gpuCache['log1Gamma_gpu']
+        logpsidenom_gpu = self.gpuCache['logpsidenom_gpu']
+        psi0_gpu = self.gpuCache['psi0_gpu']
+        psi1_gpu = self.gpuCache['psi1_gpu']
+        psi2_gpu = self.gpuCache['psi2_gpu']
+
+        Z_gpu.set(np.asfortranarray(Z))
+        mu_gpu.set(np.asfortranarray(mu))
+        S_gpu.set(S)
+        gamma_gpu.set(gamma)
+        linalg_gpu.log(gamma_gpu,logGamma_gpu)
+        linalg_gpu.logOne(gamma_gpu,log1Gamma_gpu)
+
+        psi0_gpu.fill(variance)
+        if het_noise:
+            comp_logpsidenom_het(logpsidenom_gpu, S_gpu,l_gpu,1.0)
+            comp_psi1_het(psi1_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsidenom_gpu, N, M, Q)
+            comp_logpsidenom_het(logpsidenom_gpu, S_gpu,l_gpu,2.0)
+            comp_psi2_het(psi2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsidenom_gpu, N, M, Q)
+        else:
+            comp_logpsidenom(logpsidenom_gpu, S_gpu,lengthscale2,1.0)
+            comp_psi1(psi1_gpu, variance, lengthscale2, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsidenom_gpu, N, M, Q)
+            comp_logpsidenom(logpsidenom_gpu, S_gpu,lengthscale2,2.0)
+            comp_psi2(psi2_gpu, variance, lengthscale2, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsidenom_gpu, N, M, Q)
+
+        return psi0_gpu.get(), psi1_gpu.get(), psi2_gpu.get()
+
 @Cache_this(limit=1)
 def _Z_distances(Z):
@@ -199,7 +298,7 @@ def _psi1computations(variance, lengthscale, Z, mu, S, gamma):
     logGamma_gpu = gpuarray.to_gpu(np.asfortranarray(np.log(gamma)))
     log1Gamma_gpu = gpuarray.to_gpu(np.asfortranarray(np.log(1.-gamma)))
     logpsi1denom_gpu = gpuarray.to_gpu(np.asfortranarray(np.log(S/lengthscale2+1.)))
-    psi1_gpu = gpuarray.empty((mu.shape[0],Z.shape[0]),np.float64)
+    psi1_gpu = gpuarray.empty((mu.shape[0],Z.shape[0]),np.float64, order='F')

     comp_psi1(psi1_gpu, variance, lengthscale2, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi1denom_gpu, N, M, Q)

@@ -265,7 +364,7 @@ def _psi2computations(variance, lengthscale, Z, mu, S, gamma):
     logGamma_gpu = gpuarray.to_gpu(np.asfortranarray(np.log(gamma)))
     log1Gamma_gpu = gpuarray.to_gpu(np.asfortranarray(np.log(1.-gamma)))
     logpsi2denom_gpu = gpuarray.to_gpu(np.asfortranarray(np.log(2.*S/lengthscale2+1.)))
-    psi2_gpu = gpuarray.empty((mu.shape[0],Z.shape[0],Z.shape[0]),np.float64)
+    psi2_gpu = gpuarray.empty((mu.shape[0],Z.shape[0],Z.shape[0]),np.float64, order='F')

     comp_psi2(psi2_gpu, variance, lengthscale2, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi2denom_gpu, N, M, Q)

diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py
index 0cf8b8de..20234c99 100644
--- a/GPy/kern/_src/rbf.py
+++ b/GPy/kern/_src/rbf.py
@@ -8,7 +8,8 @@ from ...util.misc import param_to_array
 from stationary import Stationary
 from GPy.util.caching import Cache_this
 from ...core.parameterization import variational
-from psi_comp import ssrbf_psi_gpucomp as ssrbf_psi_comp
+from psi_comp import ssrbf_psi_comp
+from psi_comp.ssrbf_psi_gpucomp import PSICOMP_SSRBF

 class RBF(Stationary):
     """
@@ -19,10 +20,15 @@ class RBF(Stationary):
        k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg)

     """
+    _support_GPU = True
     def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='rbf'):
         super(RBF, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
         self.weave_options = {}
         self.group_spike_prob = False
+
+        if self.useGPU:
+            self.psicomp = PSICOMP_SSRBF()
+
     def K_of_r(self, r):
         return self.variance * np.exp(-0.5 * r**2)
@@ -35,18 +41,28 @@ class RBF(Stationary):
     #---------------------------------------#

     def psi0(self, Z, variational_posterior):
-        return self.Kdiag(variational_posterior.mean)
+        if self.useGPU and isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
+            return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[0]
+        else:
+            return self.Kdiag(variational_posterior.mean)

     def psi1(self, Z, variational_posterior):
         if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
-            psi1, _, _, _, _, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
+            if self.useGPU:
+                return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[1]
+            else:
+                psi1, _, _, _, _, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
         else:
             _, _, _, psi1 = self._psi1computations(Z, variational_posterior)
         return psi1

     def psi2(self, Z, variational_posterior):
         if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
-            psi2, _, _, _, _, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
+            if self.useGPU:
+                return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[2]
+            else:
+                psi2, _, _, _, _, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
         else:
             _, _, _, _, psi2 = self._psi2computations(Z, variational_posterior)
         return psi2
diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py
index ec0f032a..eb7c4428 100644
--- a/GPy/models/ss_gplvm.py
+++ b/GPy/models/ss_gplvm.py
@@ -62,7 +62,9 @@ class SSGPLVM(SparseGP):
         if group_spike:
             kernel.group_spike_prob = True
             self.variational_prior.group_spike_prob = True
-
+
+        if isinstance(inference_method, VarDTC_GPU) and self.kern._support_GPU:
+            self.kern.useGPU = True
         SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, name, **kwargs)
         self.add_parameter(self.X, index=0)
diff --git a/GPy/util/linalg_gpu.py b/GPy/util/linalg_gpu.py
index 12d5a823..d2528a63 100644
--- a/GPy/util/linalg_gpu.py
+++ b/GPy/util/linalg_gpu.py
@@ -10,6 +10,20 @@ import numpy as np
 try:
     import pycuda.autoinit
     from pycuda.reduction import ReductionKernel
+    from pycuda.elementwise import ElementwiseKernel
+
+    # log|A| for a lower-triangular matrix A
+    # logDiagSum(A, A.shape[0]+1)
     logDiagSum = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="i%step==0?log(x[i]):0", arguments="double *x, int step")
+
+    #=======================================================================================
+    # Element-wise functions
+    #=======================================================================================
+
+    # log(X)
+    log = ElementwiseKernel("double *in, double *out", "out[i] = log(in[i])", "log_element")
+
+    # log(1.0-X)
+    logOne = ElementwiseKernel("double *in, double *out", "out[i] = log(1.-in[i])", "logOne_element")
 except:
     pass
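
Usage sketch (illustrative, not part of the diff): the snippet below shows how the GPU psi statistics added above could be exercised directly, assuming this patch is applied and a working PyCUDA installation is available. The import path and call signature follow the rbf.py changes (PSICOMP_SSRBF is constructed with no arguments and psicomputations takes variance, lengthscale, Z, mu, S, gamma). The array shapes are assumptions based on the conventions in ssrbf_psi_gpucomp.py (N data points, M inducing points, Q latent dimensions); the exact layout of the returned psi2 buffer depends on the gpuCache allocation in _initGPUCache, which lies outside the hunks shown here.

    import numpy as np
    from GPy.kern._src.psi_comp.ssrbf_psi_gpucomp import PSICOMP_SSRBF

    N, M, Q = 50, 10, 3
    variance = 1.0                      # RBF variance
    lengthscale = np.ones(Q)            # one lengthscale per latent dimension (ARD)
    Z = np.random.randn(M, Q)           # inducing inputs
    mu = np.random.randn(N, Q)          # spike-and-slab posterior means
    S = np.random.rand(N, Q)            # spike-and-slab posterior variances
    gamma = np.random.rand(N, Q)        # binary (spike) probabilities

    psicomp = PSICOMP_SSRBF()           # no-argument construction, as in RBF.__init__ above
    psi0, psi1, psi2 = psicomp.psicomputations(variance, lengthscale, Z, mu, S, gamma)

In model code the same path is reached indirectly: SSGPLVM.__init__ sets kern.useGPU = True whenever the inference method is a VarDTC_GPU instance and the kernel declares _support_GPU = True (as RBF now does), after which RBF.psi0/psi1/psi2 route SpikeAndSlabPosterior objects through PSICOMP_SSRBF.psicomputations and index the returned (psi0, psi1, psi2) triple.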