[GPU] varDTC_gpu ready

2026-05-10 20:42:39 +02:00 · 2014-05-09 16:12:16 +01:00 · 2014-05-09 16:12:16 +01:00 · a702b0862b
commit a702b0862b
parent ddbf15d763
4 changed files with 52 additions and 113 deletions
--- a/GPy/inference/latent_function_inference/var_dtc_gpu.py
+++ b/GPy/inference/latent_function_inference/var_dtc_gpu.py
@ -70,10 +70,6 @@ class VarDTC_GPU(object):
                             'dL_dpsi0_gpu'         :gpuarray.empty((self.batchsize,),np.float64,order='F'),
                             'dL_dpsi1_gpu'         :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'),
                             'dL_dpsi2_gpu'         :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'),
                             'dL_dthetaL_gpu'       :gpuarray.empty((self.batchsize,),np.float64,order='F'),
                             'betapsi1_gpu'         :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'),
                             'thetaL_t_gpu'         :gpuarray.empty((self.batchsize,),np.float64,order='F'),
                             'betaYT2_gpu'          :gpuarray.empty((output_dim,self.batchsize),np.float64,order='F'),
                             'psi0p_gpu'            :gpuarray.empty((self.batchsize,),np.float64,order='F'),
                             'psi1p_gpu'            :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'),
                             'psi2p_gpu'            :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'),
@ -377,12 +373,11 @@ class VarDTC_GPU(object):
        # Compute dL_dthetaL for uncertian input and non-heter noise
        #======================================================================        
-        if uncertain_inputs and not het_noise:
+        if not het_noise:
-            dL_dthetaL = (YRY_full*beta + beta*output_dim*psi0_full - num_data*output_dim*beta)/2.
+            dL_dthetaL = (YRY_full + output_dim*psi0_full - num_data*output_dim)/-2.
-            dL_dthetaL += -beta*cublas.cublasDdot(self.cublas_handle,dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata,1,psi2_gpu.gpudata,1)
+            dL_dthetaL += cublas.cublasDdot(self.cublas_handle,dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata,1,psi2_gpu.gpudata,1)
-            dL_dthetaL += -beta*cublas.cublasDdot(self.cublas_handle,dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata,1,psi2_gpu.gpudata,1)
+            dL_dthetaL += cublas.cublasDdot(self.cublas_handle,v_gpu.size, v_gpu.gpudata,1,psi1Y_gpu.gpudata,1)
-            dL_dthetaL += traceDot(v_gpu, psi1Y_gpu, num_inducing, output_dim)
+            self.midRes['dL_dthetaL'] = -beta*dL_dthetaL
            self.midRes['dL_dthetaL'] = dL_dthetaL
        return logL, dL_dKmm_gpu.get(), post
@ -419,22 +414,22 @@ class VarDTC_GPU(object):
            beta = beta[n_start] # nSlice==1
        if kern.useGPU:
-            if uncertain_inputs:
+            if not uncertain_inputs:
                psi0p_gpu = kern.psi0(Z, X_slice)
                psi1p_gpu = kern.psi1(Z, X_slice)
                psi2p_gpu = kern.psi2(Z, X_slice)
            else:
                psi0p_gpu = kern.Kdiag(X_slice)
                psi1p_gpu = kern.K(X_slice, Z)
                psi2p_gpu = self.gpuCache['psi2p_gpu']
-        else:
+            elif het_noise:
-            if uncertain_inputs:
+                psi0p_gpu = kern.psi0(Z, X_slice)
                psi1p_gpu = kern.psi1(Z, X_slice)
                psi2p_gpu = kern.psi2(Z, X_slice)
        elif not uncertain_inputs or het_noise:
            if not uncertain_inputs:
                psi0 = kern.Kdiag(X_slice)
                psi1 = kern.K(X_slice, Z)
            elif het_noise:
                psi0 = kern.psi0(Z, X_slice)
                psi1 = kern.psi1(Z, X_slice)
                psi2 = kern.psi2(Z, X_slice)
            else:
                psi0 = kern.Kdiag(X_slice)
                psi1 = kern.K(X_slice, Z)
            psi0p_gpu = self.gpuCache['psi0p_gpu']
            psi1p_gpu = self.gpuCache['psi1p_gpu']
@ -448,41 +443,21 @@ class VarDTC_GPU(object):
                psi2p_gpu.set(np.asfortranarray(psi2))
        #======================================================================
-        # Prepare gpu memory
+        # Compute dL_dpsi
        #======================================================================
        dL_dpsi2R_gpu = self.gpuCache['dL_dpsi2R_gpu']
        v_gpu = self.gpuCache['v_gpu']        
        betaYT_gpu = self.gpuCache['betaYT_gpu']
        beta_gpu = self.gpuCache['beta_gpu']
        dL_dpsi0_gpu = self.gpuCache['dL_dpsi0_gpu']
        dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu']
        dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu']
-        dL_dthetaL_gpu = self.gpuCache['dL_dthetaL_gpu']
+        betaYT_gpu = self.gpuCache['betaYT_gpu']
        psi2R_gpu = self.gpuCache['psi2_t_gpu'][:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing)
        betapsi1_gpu = self.gpuCache['betapsi1_gpu']
        thetaL_t_gpu = self.gpuCache['thetaL_t_gpu']
        betaYT2_gpu = self.gpuCache['betaYT2_gpu']
        betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end]
        beta_gpu_slice = beta_gpu[n_start:n_end]
        # Adjust to the batch size
        if dL_dpsi0_gpu.shape[0] > nSlice:
            betaYT2_gpu = betaYT2_gpu[:,:nSlice]
            dL_dpsi0_gpu = dL_dpsi0_gpu.ravel()[:nSlice]
            dL_dpsi1_gpu = dL_dpsi1_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing)
            dL_dpsi2_gpu = dL_dpsi2_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing)
            dL_dthetaL_gpu = dL_dthetaL_gpu.ravel()[:nSlice]
            psi2R_gpu = psi2R_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing)
            thetaL_t_gpu = thetaL_t_gpu.ravel()[:nSlice]
            betapsi1_gpu = betapsi1_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing)
        mul_bcast(betapsi1_gpu,beta_gpu_slice,psi1p_gpu,beta_gpu_slice.size)
        #======================================================================
        # Compute dL_dpsi
        #======================================================================
        dL_dpsi0_gpu.fill(-output_dim *beta/2.)
@ -490,52 +465,18 @@ class VarDTC_GPU(object):
        if uncertain_inputs:
            cublas.cublasDcopy(self.cublas_handle, dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata, 1, dL_dpsi2_gpu.gpudata, 1)
-            cublas.cublasDscal(self.cublas_handle, dL_dpsi2_gpu.szie, beta, dL_dpsi2_gpu.gpudata, 1)
+            cublas.cublasDscal(self.cublas_handle, dL_dpsi2_gpu.size, beta, dL_dpsi2_gpu.gpudata, 1)
        else:
-            cublas.cublasDgemm(self.cublas_handle, 'N', 'N', nSlice, num_inducing, output_dim, 1.0, betapsi1_gpu.gpudata, nSlice, dL_dpsi2R_gpu.gpudata, num_inducing, 1.0, dL_dpsi1_gpu.gpudata, nSlice)
+            cublas.cublasDgemm(self.cublas_handle, 'N', 'N', nSlice, num_inducing, output_dim, beta, psi1p_gpu.gpudata, nSlice, dL_dpsi2R_gpu.gpudata, num_inducing, 1.0, dL_dpsi1_gpu.gpudata, nSlice)
        #======================================================================
        # Compute dL_dthetaL
        #======================================================================
-        
+        if het_noise:
-        if uncertain_inputs and not het_noise:
+            betaY = betaYT_gpu_slice.get()
-            if isEnd:
+            dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0p_gpu.get())-output_dim*beta)/2.
-                dL_dthetaL = self.midRes['dL_dthetaL']
+            dL_dthetaL += -beta*beta*cublas.cublasDdot(self.cublas_handle,dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata,1,psi2p_gpu.gpudata,1)
-            else:
+            dL_dthetaL += -beta*(betaY*np.dot(psi1p_gpu.get(),v_gpu.get())).sum(axis=-1)
                dL_dthetaL = 0
 #         if not uncertain_inputs:
 #             cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, num_inducing, nSlice, beta, psi1p_gpu.gpudata, nSlice, psi1p_gpu.gpudata, nSlice, 1.0, psi2p_gpu.gpudata, num_inducing)
 #            join_prod(psi2p_gpu,psi1p_gpu,psi1p_gpu,nSlice,num_inducing)
 #             psiR = cublas.cublasDdot(self.cublas_handle, dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata, 1, psi2p_gpu.gpudata, 1)
 #     #        mul_bcast_first(psi2R_gpu,dL_dpsi2R_gpu,psi2p_gpu,nSlice)
 #     
 #             dL_dthetaL_gpu.fill(0.)
 #             dL_dthetaL = 0.
 #             
 #             dL_dthetaL += cublas.cublasDdot(self.cublas_handle, betaYT_gpu_slice.size, betaYT_gpu_slice.gpudata,1,betaYT_gpu_slice.gpudata,1)
 #             
 #             cublas.cublasDcopy(self.cublas_handle, betaYT_gpu_slice.size, betaYT_gpu_slice.gpudata, 1, betaYT2_gpu.gpudata, 1)
 #             mul_bcast(betaYT2_gpu,betaYT2_gpu,betaYT2_gpu,betaYT2_gpu.size)
 #             cublas.cublasDscal(self.cublas_handle, betaYT2_gpu.size, 0.5, betaYT2_gpu.gpudata, 1)
 #             sum_axis(dL_dthetaL_gpu, betaYT2_gpu, 1, output_dim)
 #             
 #             cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, output_dim/(-2.0), beta_gpu_slice.gpudata, 1, dL_dthetaL_gpu.gpudata, 1)
 #             cublas.cublasDcopy(self.cublas_handle, beta_gpu_slice.size, beta_gpu_slice.gpudata, 1, thetaL_t_gpu.gpudata, 1)
 #             mul_bcast(thetaL_t_gpu,thetaL_t_gpu,thetaL_t_gpu,thetaL_t_gpu.size)
 #             mul_bcast(thetaL_t_gpu,thetaL_t_gpu,psi0p_gpu,thetaL_t_gpu.size)
 #             cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, output_dim/2.0, thetaL_t_gpu.gpudata, 1, dL_dthetaL_gpu.gpudata, 1)
 #             
 #             thetaL_t_gpu.fill(0.)
 #             sum_axis(thetaL_t_gpu, psi2R_gpu, nSlice, num_inducing*num_inducing)
 #             mul_bcast(thetaL_t_gpu,thetaL_t_gpu,beta_gpu_slice,thetaL_t_gpu.size)
 #             mul_bcast(thetaL_t_gpu,thetaL_t_gpu,beta_gpu_slice,thetaL_t_gpu.size)
 #             cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, -1.0, thetaL_t_gpu.gpudata, 1, dL_dthetaL_gpu.gpudata, 1)
 #             
 #             cublas.cublasDgemm(self.cublas_handle, 'T', 'T', output_dim, nSlice, num_inducing, -1.0, v_gpu.gpudata, num_inducing, betapsi1_gpu.gpudata, nSlice, 0.0, betaYT2_gpu.gpudata, output_dim)
 #             mul_bcast(betaYT2_gpu,betaYT2_gpu,betaYT_gpu_slice,betaYT2_gpu.size)
 #             sum_axis(dL_dthetaL_gpu, betaYT2_gpu, 1, output_dim)
        if kern.useGPU:
            dL_dpsi0 = dL_dpsi0_gpu
@ -548,10 +489,11 @@ class VarDTC_GPU(object):
                dL_dpsi2 = dL_dpsi2_gpu
            else:
                dL_dpsi2 = dL_dpsi2_gpu.get()
-        if het_noise:
+        if not het_noise:
-            dL_dthetaL = dL_dthetaL_gpu.get()
+            if isEnd:
-        else:
+                dL_dthetaL = self.midRes['dL_dthetaL']
-            dL_dthetaL = gpuarray.sum(dL_dthetaL_gpu).get()
+            else:
                dL_dthetaL = 0.
        if uncertain_inputs:
            grad_dict = {'dL_dpsi0':dL_dpsi0,
                         'dL_dpsi1':dL_dpsi1,
--- a/GPy/inference/latent_function_inference/var_dtc_parallel.py
+++ b/GPy/inference/latent_function_inference/var_dtc_parallel.py
@ -116,7 +116,7 @@ class VarDTC_minibatch(object):
                if het_noise:
                    psi2_full += beta_slice*np.outer(psi1,psi1)
                else:
-                    psi2_full += np.outer(psi1.T,psi1.T)
+                    psi2_full += np.dot(psi1.T,psi1)
        if not het_noise:
            psi0_full *= beta
@ -176,7 +176,7 @@ class VarDTC_minibatch(object):
        # Compute dL_dthetaL for uncertian input and non-heter noise
        #======================================================================        
-        if uncertain_inputs and not het_noise:
+        if not het_noise:
            dL_dthetaL = (YRY_full*beta + beta*output_dim*psi0_full - num_data*output_dim*beta)/2. - beta*(dL_dpsi2R*psi2_full).sum() - beta*(v.T*psi1Y_full).sum()
            self.midRes['dL_dthetaL'] = dL_dthetaL
@ -265,14 +265,10 @@ class VarDTC_minibatch(object):
            dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0)-output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum(axis=-1)
        else:
-            if uncertain_inputs:
+            if isEnd:
-                if isEnd:
+                dL_dthetaL = self.midRes['dL_dthetaL']
                    dL_dthetaL = self.midRes['dL_dthetaL']
                else:
                    dL_dthetaL = 0.
            else:
-                psiR = np.einsum('nm,no,mo->',psi1,psi1,dL_dpsi2R)
+                dL_dthetaL = 0.
                dL_dthetaL = ((np.square(betaY)).sum() + beta*beta*output_dim*(psi0.sum())-num_slice*output_dim*beta)/2. - beta*beta*psiR- (betaY*np.dot(betapsi1,v)).sum()
        if uncertain_inputs:
            grad_dict = {'dL_dpsi0':dL_dpsi0,
--- a/GPy/models/ss_gplvm.py
+++ b/GPy/models/ss_gplvm.py
@ -42,7 +42,8 @@ class SSGPLVM(SparseGP):
            X_variance = np.random.uniform(0,.1,X.shape)
        gamma = np.empty_like(X, order='F') # The posterior probabilities of the binary variable in the variational approximation
-        gamma[:] = 0.5 + 0.01 * np.random.randn(X.shape[0], input_dim)
+        #gamma[:] = 0.5 + 0.01 * np.random.randn(X.shape[0], input_dim)
        gamma[:] = 0.5
        if group_spike:
            gamma[:] = gamma.mean(axis=0)
--- a/GPy/util/linalg_gpu.py
+++ b/GPy/util/linalg_gpu.py
@ -19,8 +19,8 @@ try:
    strideSum = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="i%step==0?x[i]:0", arguments="double *x, int step")
-    # np.trace(np.dot(A,B)) (also equivalent to (A*B.T).sum() ) A - n x m, B - m x n
+    # np.trace(np.dot(A,B)) (also equivalent to (A*B.T).sum() ) A - a1 x a2, B - a2 x a1
-    traceDot = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="A[i]*B[(i%n)*m+i/n]", arguments="double *A, double *B, int n, int m")
+    traceDot = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="A[i]*B[(i%a1)*a2+i/a1]", arguments="double *A, double *B, int a1, int a2")
    #=======================================================================================
    # Element-wise functions