diff --git a/GPy/core/model.py b/GPy/core/model.py
index 90294a73..28223429 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -263,7 +263,7 @@ class Model(Parameterized):
         sgd.run()
         self.optimization_runs.append(sgd)

-    def _checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3):
+    def _checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3, df_tolerance=1e-12):
         """
         Check the gradient of the model by comparing to a numerical
         estimate. If the verbose flag is passed, individual
@@ -322,7 +322,7 @@ class Model(Parameterized):
             except NotImplementedError:
                 names = ['Variable %i' % i for i in range(len(x))]

             # Prepare for pretty-printing
-            header = ['Name', 'Ratio', 'Difference', 'Analytical', 'Numerical']
+            header = ['Name', 'Ratio', 'Difference', 'Analytical', 'Numerical', 'dF_ratio']
             max_names = max([len(names[i]) for i in range(len(names))] + [len(header[0])])
             float_len = 10
             cols = [max_names]
@@ -359,6 +359,8 @@ class Model(Parameterized):
             f1 = self._objective(xx)
             xx[xind] -= 2.*step
             f2 = self._objective(xx)
+            df_ratio = np.abs((f1-f2)/min(f1,f2))
+            df_unstable = df_ratio < df_tolerance
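Note on the `_checkgrad` change above: the new `df_tolerance` argument flags parameters whose objective barely moves under the finite-difference step, since the analytical/numerical gradient ratio is numerically meaningless there. A minimal self-contained sketch of the same test (`f` and `grad_f` are stand-ins, not GPy code):

    import numpy as np

    def check_grad(f, grad_f, x, step=1e-6, df_tolerance=1e-12):
        """Compare an analytical gradient with a central finite difference."""
        analytic = grad_f(x)
        numeric = np.empty_like(x)
        unstable = np.zeros(x.shape, dtype=bool)
        for i in range(x.size):
            xx = x.copy()
            xx[i] += step
            f1 = f(xx)
            xx[i] -= 2.*step
            f2 = f(xx)
            numeric[i] = (f1 - f2) / (2.*step)
            # If the objective barely changes, the ratio below is untrustworthy.
            unstable[i] = np.abs((f1 - f2) / min(f1, f2)) < df_tolerance
        ratio = numeric / analytic
        return ratio, unstable

    # e.g. f(x) = sum(x**2), gradient 2x: ratios should be ~1, nothing unstable
    print(check_grad(lambda x: np.sum(x**2), lambda x: 2*x, np.array([0.3, -1.2])))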
diff --git a/GPy/inference/latent_function_inference/var_dtc_gpu.py b/GPy/inference/latent_function_inference/var_dtc_gpu.py
deleted file mode 100644
--- a/GPy/inference/latent_function_inference/var_dtc_gpu.py
+++ /dev/null
-    def get_YYTfactor(self, Y):
-        N, D = Y.shape
-        if (N>=D):
-            return Y.view(np.ndarray)
-        else:
-            return jitchol(tdot(Y))
-
-    def gatherPsiStat(self, kern, X, Z, Y, beta, uncertain_inputs, het_noise):
-
-        num_inducing, input_dim = Z.shape[0], Z.shape[1]
-        num_data, output_dim = Y.shape
-        trYYT = self._trYYT
-        psi1Y_gpu = self.gpuCache['psi1Y_gpu']
-        psi2_gpu = self.gpuCache['psi2_gpu']
-        beta_gpu = self.gpuCache['beta_gpu']
-        YT_gpu = self.gpuCache['YT_gpu']
-        betaYT_gpu = self.gpuCache['betaYT_gpu']
-
-        beta_gpu.fill(beta)
-        betaYT_gpu.fill(0.)
-        cublas.cublasDaxpy(self.cublas_handle, betaYT_gpu.size, beta, YT_gpu.gpudata, 1, betaYT_gpu.gpudata, 1)
-        YRY_full = trYYT*beta
-
-        if kern.useGPU:
-            psi1Y_gpu.fill(0.)
-            psi2_gpu.fill(0.)
-            psi0_full = 0
-
-            for n_start in xrange(0,num_data,self.batchsize):
-                n_end = min(self.batchsize+n_start, num_data)
-                ndata = n_end - n_start
-                X_slice = X[n_start:n_end]
-                betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end]
-
-                if uncertain_inputs:
-                    psi0 = kern.psi0(Z, X_slice)
-                    psi1p_gpu = kern.psi1(Z, X_slice)
-                    psi2p_gpu = kern.psi2(Z, X_slice)
-                else:
-                    psi0 = kern.Kdiag(X_slice)
-                    psi1p_gpu = kern.K(X_slice, Z)
-
-                cublas.cublasDgemm(self.cublas_handle, 'T', 'T', num_inducing, output_dim, ndata, 1.0, psi1p_gpu.gpudata, ndata, betaYT_gpu_slice.gpudata, output_dim, 1.0, psi1Y_gpu.gpudata, num_inducing)
-
-                psi0_full += psi0.sum()
-
-                if uncertain_inputs:
-                    sum_axis(psi2_gpu,psi2p_gpu,1,1)
-                else:
-                    cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, num_inducing, ndata, beta, psi1p_gpu.gpudata, ndata, psi1p_gpu.gpudata, ndata, 1.0, psi2_gpu.gpudata, num_inducing)
-
-            psi0_full *= beta
-            if uncertain_inputs:
-                cublas.cublasDscal(self.cublas_handle, psi2_gpu.size, beta, psi2_gpu.gpudata, 1)
-
-        else:
-            psi2_full = np.zeros((num_inducing,num_inducing))
-            psi1Y_full = np.zeros((output_dim,num_inducing)) # DxM
-            psi0_full = 0.
-            YRY_full = 0.
-
-            for n_start in xrange(0,num_data,self.batchsize):
-                n_end = min(self.batchsize+n_start, num_data)
-                Y_slice = Y[n_start:n_end]
-                X_slice = X[n_start:n_end]
-
-                if het_noise:
-                    b = beta[n_start]
-                    YRY_full += np.inner(Y_slice, Y_slice)*b
-                else:
-                    b = beta
-
-                if uncertain_inputs:
-                    psi0 = kern.psi0(Z, X_slice)
-                    psi1 = kern.psi1(Z, X_slice)
-                    psi2_full += kern.psi2(Z, X_slice)*b
-                else:
-                    psi0 = kern.Kdiag(X_slice)
-                    psi1 = kern.K(X_slice, Z)
-                    psi2_full += np.dot(psi1.T,psi1)*b
-
-                psi0_full += psi0.sum()*b
-                psi1Y_full += np.dot(Y_slice.T,psi1)*b # DxM
-
-            if not het_noise:
-                YRY_full = trYYT*beta
-            psi1Y_gpu.set(psi1Y_full)
-            psi2_gpu.set(psi2_full)
-
-        return psi0_full, YRY_full
-
-    def inference_likelihood(self, kern, X, Z, likelihood, Y):
-        """
-        The first phase of inference:
-        Compute: log-likelihood, dL_dKmm
-
-        Cached intermediate results: Kmm, KmmInv,
-        """
-
-        num_inducing, input_dim = Z.shape[0], Z.shape[1]
-        num_data, output_dim = Y.shape
-
-        # see whether we've got a different noise variance for each datum
-        beta = 1./np.fmax(likelihood.variance, 1e-6)
-        het_noise = beta.size > 1
-        if het_noise:
-            self.batchsize = 0
-
-        self._initGPUCache(kern, num_inducing, input_dim, output_dim, Y)
-
-        if isinstance(X, VariationalPosterior):
-            uncertain_inputs = True
-        else:
-            uncertain_inputs = False
-
-        psi1Y_gpu = self.gpuCache['psi1Y_gpu']
-        psi2_gpu = self.gpuCache['psi2_gpu']
-
-        psi0_full, YRY_full = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs, het_noise)
-
-        #======================================================================
-        # Compute Common Components
-        #======================================================================
-
-        Kmm = kern.K(Z).copy()
-        Kmm_gpu = self.gpuCache['Kmm_gpu']
-        Kmm_gpu.set(np.asfortranarray(Kmm))
-        diag.add(Kmm, self.const_jitter)
-        ones_gpu = self.gpuCache['ones_gpu']
-        cublas.cublasDaxpy(self.cublas_handle, num_inducing, self.const_jitter, ones_gpu.gpudata, 1, Kmm_gpu.gpudata, num_inducing+1)
-#        assert np.allclose(Kmm, Kmm_gpu.get())
-
-#        Lm = jitchol(Kmm)
-        #
-        Lm_gpu = self.gpuCache['Lm_gpu']
-        cublas.cublasDcopy(self.cublas_handle, Kmm_gpu.size, Kmm_gpu.gpudata, 1, Lm_gpu.gpudata, 1)
-        culinalg.cho_factor(Lm_gpu,'L')
-#        print np.abs(np.tril(Lm)-np.tril(Lm_gpu.get())).max()
-
-#        Lambda = Kmm+psi2_full
-#        LL = jitchol(Lambda)
-        #
-        Lambda_gpu = self.gpuCache['LL_gpu']
-        cublas.cublasDcopy(self.cublas_handle, Kmm_gpu.size, Kmm_gpu.gpudata, 1, Lambda_gpu.gpudata, 1)
-        cublas.cublasDaxpy(self.cublas_handle, psi2_gpu.size, np.float64(1.0), psi2_gpu.gpudata, 1, Lambda_gpu.gpudata, 1)
-        LL_gpu = Lambda_gpu
-        culinalg.cho_factor(LL_gpu,'L')
-#        print np.abs(np.tril(LL)-np.tril(LL_gpu.get())).max()
-
-#        b,_ = dtrtrs(LL, psi1Y_full)
-#        bbt_cpu = np.square(b).sum()
-        #
-        b_gpu = self.gpuCache['b_gpu']
-        cublas.cublasDcopy(self.cublas_handle, b_gpu.size, psi1Y_gpu.gpudata, 1, b_gpu.gpudata, 1)
-        cublas.cublasDtrsm(self.cublas_handle, 'L', 'L', 'N', 'N', num_inducing, output_dim, np.float64(1.0), LL_gpu.gpudata, num_inducing, b_gpu.gpudata, num_inducing)
-        bbt = cublas.cublasDdot(self.cublas_handle, b_gpu.size, b_gpu.gpudata, 1, b_gpu.gpudata, 1)
-#        print np.abs(bbt-bbt_cpu)
-
-#        v,_ = dtrtrs(LL.T,b,lower=False)
-#        vvt = np.einsum('md,od->mo',v,v)
-#        LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right')
-        #
-        v_gpu = self.gpuCache['v_gpu']
-        cublas.cublasDcopy(self.cublas_handle, v_gpu.size, b_gpu.gpudata, 1, v_gpu.gpudata, 1)
-        cublas.cublasDtrsm(self.cublas_handle, 'L', 'L', 'T', 'N', num_inducing, output_dim, np.float64(1.0), LL_gpu.gpudata, num_inducing, v_gpu.gpudata, num_inducing)
-        vvt_gpu = self.gpuCache['vvt_gpu']
-        cublas.cublasDgemm(self.cublas_handle, 'N', 'T', num_inducing, num_inducing, output_dim, np.float64(1.0), v_gpu.gpudata, num_inducing, v_gpu.gpudata, num_inducing, np.float64(0.), vvt_gpu.gpudata, num_inducing)
-        LmInvPsi2LmInvT_gpu = self.gpuCache['KmmInvPsi2LLInvT_gpu']
-        cublas.cublasDcopy(self.cublas_handle, psi2_gpu.size, psi2_gpu.gpudata, 1, LmInvPsi2LmInvT_gpu.gpudata, 1)
-        cublas.cublasDtrsm(self.cublas_handle, 'L', 'L', 'N', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, LmInvPsi2LmInvT_gpu.gpudata, num_inducing)
-        cublas.cublasDtrsm(self.cublas_handle, 'r', 'L', 'T', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, LmInvPsi2LmInvT_gpu.gpudata, num_inducing)
-        #tr_LmInvPsi2LmInvT = cublas.cublasDasum(self.cublas_handle, num_inducing, LmInvPsi2LmInvT_gpu.gpudata, num_inducing+1)
-        tr_LmInvPsi2LmInvT = float(strideSum(LmInvPsi2LmInvT_gpu, num_inducing+1).get())
-#        print np.abs(vvt-vvt_gpu.get()).max()
-#        print np.abs(np.trace(LmInvPsi2LmInvT)-tr_LmInvPsi2LmInvT)
-
-#        Psi2LLInvT = dtrtrs(LL,psi2_full)[0].T
-#        LmInvPsi2LLInvT = dtrtrs(Lm,Psi2LLInvT)[0]
-#        KmmInvPsi2LLInvT = dtrtrs(Lm,LmInvPsi2LLInvT,trans=True)[0]
-#        KmmInvPsi2P = dtrtrs(LL,KmmInvPsi2LLInvT.T, trans=True)[0].T
-        #
-        KmmInvPsi2LLInvT_gpu = LmInvPsi2LmInvT_gpu # Reuse GPU memory (size: MxM)
-        cublas.cublasDcopy(self.cublas_handle, psi2_gpu.size, psi2_gpu.gpudata, 1, KmmInvPsi2LLInvT_gpu.gpudata, 1)
-        cublas.cublasDtrsm(self.cublas_handle, 'L', 'L', 'N', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, KmmInvPsi2LLInvT_gpu.gpudata, num_inducing)
-        cublas.cublasDtrsm(self.cublas_handle, 'r', 'L', 'T', 'N', num_inducing, num_inducing, np.float64(1.0), LL_gpu.gpudata, num_inducing, KmmInvPsi2LLInvT_gpu.gpudata, num_inducing)
-        cublas.cublasDtrsm(self.cublas_handle, 'L', 'L', 'T', 'N', num_inducing, num_inducing, np.float64(1.0), Lm_gpu.gpudata, num_inducing, KmmInvPsi2LLInvT_gpu.gpudata, num_inducing)
-        KmmInvPsi2P_gpu = self.gpuCache['KmmInvPsi2P_gpu']
-        cublas.cublasDcopy(self.cublas_handle, KmmInvPsi2LLInvT_gpu.size, KmmInvPsi2LLInvT_gpu.gpudata, 1, KmmInvPsi2P_gpu.gpudata, 1)
-        cublas.cublasDtrsm(self.cublas_handle, 'r', 'L', 'N', 'N', num_inducing, num_inducing, np.float64(1.0), LL_gpu.gpudata, num_inducing, KmmInvPsi2P_gpu.gpudata, num_inducing)
-#        print np.abs(KmmInvPsi2P-KmmInvPsi2P_gpu.get()).max()
-
-#        dL_dpsi2R = (output_dim*KmmInvPsi2P - vvt)/2. # dL_dpsi2 with R inside psi2
-        #
-        dL_dpsi2R_gpu = self.gpuCache['dL_dpsi2R_gpu']
-        cublas.cublasDcopy(self.cublas_handle, vvt_gpu.size, vvt_gpu.gpudata, 1, dL_dpsi2R_gpu.gpudata, 1)
-        cublas.cublasDaxpy(self.cublas_handle, KmmInvPsi2P_gpu.size, np.float64(-output_dim), KmmInvPsi2P_gpu.gpudata, 1, dL_dpsi2R_gpu.gpudata, 1)
-        cublas.cublasDscal(self.cublas_handle, dL_dpsi2R_gpu.size, np.float64(-0.5), dL_dpsi2R_gpu.gpudata, 1)
-#        print np.abs(dL_dpsi2R_gpu.get()-dL_dpsi2R).max()
-
-        #======================================================================
-        # Compute log-likelihood
-        #======================================================================
-        if het_noise:
-            logL_R = -np.log(beta).sum()
-        else:
-            logL_R = -num_data*np.log(beta)
-#        logL_old = -(output_dim*(num_data*log_2_pi+logL_R+psi0_full-np.trace(LmInvPsi2LmInvT))+YRY_full-bbt)/2.-output_dim*(-np.log(np.diag(Lm)).sum()+np.log(np.diag(LL)).sum())
-
-        logdetKmm = float(logDiagSum(Lm_gpu,num_inducing+1).get())
-        logdetLambda = float(logDiagSum(LL_gpu,num_inducing+1).get())
-        logL = -(output_dim*(num_data*log_2_pi+logL_R+psi0_full-tr_LmInvPsi2LmInvT)+YRY_full-bbt)/2.+output_dim*(logdetKmm-logdetLambda)
-#        print np.abs(logL_old - logL)
-
-        #======================================================================
-        # Compute dL_dKmm
-        #======================================================================
-
-#        dL_dKmm = -(output_dim*np.einsum('md,od->mo',KmmInvPsi2LLInvT,KmmInvPsi2LLInvT) + vvt)/2.
-        #
-        dL_dKmm_gpu = self.gpuCache['dL_dKmm_gpu']
-        cublas.cublasDgemm(self.cublas_handle, 'N', 'T', num_inducing, num_inducing, num_inducing, np.float64(1.0), KmmInvPsi2LLInvT_gpu.gpudata, num_inducing, KmmInvPsi2LLInvT_gpu.gpudata, num_inducing, np.float64(0.), dL_dKmm_gpu.gpudata, num_inducing)
-        cublas.cublasDaxpy(self.cublas_handle, dL_dKmm_gpu.size, np.float64(1./output_dim), vvt_gpu.gpudata, 1, dL_dKmm_gpu.gpudata, 1)
-        cublas.cublasDscal(self.cublas_handle, dL_dKmm_gpu.size, np.float64(-output_dim/2.), dL_dKmm_gpu.gpudata, 1)
-#        print np.abs(dL_dKmm - dL_dKmm_gpu.get()).max()
-
-        #======================================================================
-        # Compute the Posterior distribution of inducing points p(u|Y)
-        #======================================================================
-
-        post = Posterior(woodbury_inv=KmmInvPsi2P_gpu.get(), woodbury_vector=v_gpu.get(), K=Kmm_gpu.get(), mean=None, cov=None, K_chol=Lm_gpu.get())
-
-        #======================================================================
-        # Compute dL_dthetaL for uncertain input and non-heteroscedastic noise
-        #======================================================================
-
-        if not het_noise:
-            dL_dthetaL = (YRY_full + output_dim*psi0_full - num_data*output_dim)/-2.
-            dL_dthetaL += cublas.cublasDdot(self.cublas_handle, dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata, 1, psi2_gpu.gpudata, 1)
-            dL_dthetaL += cublas.cublasDdot(self.cublas_handle, v_gpu.size, v_gpu.gpudata, 1, psi1Y_gpu.gpudata, 1)
-            self.midRes['dL_dthetaL'] = -beta*dL_dthetaL
-
-        return logL, dL_dKmm_gpu.get(), post
-
-    def inference_minibatch(self, kern, X, Z, likelihood, Y):
-        """
-        The second phase of inference: Computing the derivatives over a minibatch of Y
-        Compute: dL_dpsi0, dL_dpsi1, dL_dpsi2, dL_dthetaL
-        return a flag showing whether it reached the end of Y (isEnd)
-        """
-
-        num_data, output_dim = Y.shape
-        num_inducing = Z.shape[0]
-
-        if isinstance(X, VariationalPosterior):
-            uncertain_inputs = True
-        else:
-            uncertain_inputs = False
-
-        beta = 1./np.fmax(likelihood.variance, 1e-6)
-        het_noise = beta.size > 1
-
-        n_start = self.batch_pos
-        n_end = min(self.batchsize+n_start, num_data)
-        if n_end==num_data:
-            isEnd = True
-            self.batch_pos = 0
-        else:
-            isEnd = False
-            self.batch_pos = n_end
-
-        nSlice = n_end-n_start
-        X_slice = X[n_start:n_end]
-        if het_noise:
-            beta = beta[n_start] # nSlice==1
-
-        if kern.useGPU:
-            if not uncertain_inputs:
-                psi0p_gpu = kern.Kdiag(X_slice)
-                psi1p_gpu = kern.K(X_slice, Z)
-                psi2p_gpu = self.gpuCache['psi2p_gpu']
-            elif het_noise:
-                psi0p_gpu = kern.psi0(Z, X_slice)
-                psi1p_gpu = kern.psi1(Z, X_slice)
-                psi2p_gpu = kern.psi2(Z, X_slice)
-        elif not uncertain_inputs or het_noise:
-            if not uncertain_inputs:
-                psi0 = kern.Kdiag(X_slice)
-                psi1 = kern.K(X_slice, Z)
-            elif het_noise:
-                psi0 = kern.psi0(Z, X_slice)
-                psi1 = kern.psi1(Z, X_slice)
-                psi2 = kern.psi2(Z, X_slice)
-
-            psi0p_gpu = self.gpuCache['psi0p_gpu']
-            psi1p_gpu = self.gpuCache['psi1p_gpu']
-            psi2p_gpu = self.gpuCache['psi2p_gpu']
-            if psi0p_gpu.shape[0] > nSlice:
-                psi0p_gpu = psi0p_gpu[:nSlice]
-                psi1p_gpu = psi1p_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing)
-            psi0p_gpu.set(np.asfortranarray(psi0))
-            psi1p_gpu.set(np.asfortranarray(psi1))
-            if uncertain_inputs:
-                psi2p_gpu.set(np.asfortranarray(psi2))
-
-        #======================================================================
-        # Compute dL_dpsi
-        #======================================================================
-
-        dL_dpsi2R_gpu = self.gpuCache['dL_dpsi2R_gpu']
-        v_gpu = self.gpuCache['v_gpu']
-        dL_dpsi0_gpu = self.gpuCache['dL_dpsi0_gpu']
-        dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu']
-        dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu']
-        betaYT_gpu = self.gpuCache['betaYT_gpu']
-        betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end]
-
-        # Adjust to the batch size
-        if dL_dpsi0_gpu.shape[0] > nSlice:
-            dL_dpsi0_gpu = dL_dpsi0_gpu.ravel()[:nSlice]
-            dL_dpsi1_gpu = dL_dpsi1_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing)
-
-        dL_dpsi0_gpu.fill(-output_dim*beta/2.)
-
-        cublas.cublasDgemm(self.cublas_handle, 'T', 'T', nSlice, num_inducing, output_dim, 1.0, betaYT_gpu_slice.gpudata, output_dim, v_gpu.gpudata, num_inducing, 0., dL_dpsi1_gpu.gpudata, nSlice)
-
-        if uncertain_inputs:
-            cublas.cublasDcopy(self.cublas_handle, dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata, 1, dL_dpsi2_gpu.gpudata, 1)
-            cublas.cublasDscal(self.cublas_handle, dL_dpsi2_gpu.size, beta, dL_dpsi2_gpu.gpudata, 1)
-        else:
-            cublas.cublasDgemm(self.cublas_handle, 'N', 'N', nSlice, num_inducing, output_dim, beta, psi1p_gpu.gpudata, nSlice, dL_dpsi2R_gpu.gpudata, num_inducing, 1.0, dL_dpsi1_gpu.gpudata, nSlice)
-
-        #======================================================================
-        # Compute dL_dthetaL
-        #======================================================================
-        if het_noise:
-            betaY = betaYT_gpu_slice.get()
-            dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0p_gpu.get())-output_dim*beta)/2.
-            dL_dthetaL += -beta*beta*cublas.cublasDdot(self.cublas_handle, dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata, 1, psi2p_gpu.gpudata, 1)
-            dL_dthetaL += -beta*(betaY*np.dot(psi1p_gpu.get(),v_gpu.get())).sum(axis=-1)
-
-        if kern.useGPU:
-            dL_dpsi0 = dL_dpsi0_gpu
-            dL_dpsi1 = dL_dpsi1_gpu
-        else:
-            dL_dpsi0 = dL_dpsi0_gpu.get()
-            dL_dpsi1 = dL_dpsi1_gpu.get()
-        if uncertain_inputs:
-            if kern.useGPU:
-                dL_dpsi2 = dL_dpsi2_gpu
-            else:
-                dL_dpsi2 = dL_dpsi2_gpu.get()
-        if not het_noise:
-            if isEnd:
-                dL_dthetaL = self.midRes['dL_dthetaL']
-            else:
-                dL_dthetaL = 0.
-        if uncertain_inputs:
-            grad_dict = {'dL_dpsi0':dL_dpsi0,
-                         'dL_dpsi1':dL_dpsi1,
-                         'dL_dpsi2':dL_dpsi2,
-                         'dL_dthetaL':dL_dthetaL}
-        else:
-            grad_dict = {'dL_dKdiag':dL_dpsi0,
-                         'dL_dKnm':dL_dpsi1,
-                         'dL_dthetaL':dL_dthetaL}
-
-        return isEnd, (n_start,n_end), grad_dict
-
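Note on `gatherPsiStat` (both the deleted GPU path above and the CPU fallback): the sufficient statistics psi0, Psi1ᵀY and Psi1ᵀPsi1 are accumulated over mini-batches, so no N×M array is ever held in full. A toy sketch of that accumulation with an ordinary kernel function (names and signatures here are illustrative, not GPy API):

    import numpy as np

    def gather_psi_stats_batched(K, X, Z, Y, beta, batchsize):
        """Accumulate psi0 = sum_n k(x_n,x_n), psi1Y = Y^T Psi1 and
        psi2 = Psi1^T Psi1, each scaled by the noise precision beta
        (certain inputs, homoscedastic noise)."""
        M, D = Z.shape[0], Y.shape[1]
        psi0, psi1Y, psi2 = 0., np.zeros((D, M)), np.zeros((M, M))
        for n_start in range(0, X.shape[0], batchsize):
            n_end = min(n_start + batchsize, X.shape[0])
            Xb, Yb = X[n_start:n_end], Y[n_start:n_end]
            Kb = K(Xb, Z)                           # batch slice of Psi1 (N_b x M)
            psi0 += K(Xb, Xb).diagonal().sum() * beta
            psi1Y += Yb.T.dot(Kb) * beta
            psi2 += Kb.T.dot(Kb) * beta
        return psi0, psi1Y, psi2

    rbf = lambda A, B: np.exp(-0.5*((A[:, None, :]-B[None, :, :])**2).sum(-1))
    X, Z, Y = np.random.randn(50, 2), np.random.randn(5, 2), np.random.randn(50, 3)
    full = rbf(X, Z)
    p0, p1Y, p2 = gather_psi_stats_batched(rbf, X, Z, Y, 2.0, batchsize=7)
    # Batched accumulation matches the full-matrix computation:
    print(np.allclose(p1Y, 2.0*Y.T.dot(full)), np.allclose(p2, 2.0*full.T.dot(full)))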
diff --git a/GPy/inference/latent_function_inference/var_dtc_parallel.py b/GPy/inference/latent_function_inference/var_dtc_parallel.py
index ae25f3e3..86842687 100644
--- a/GPy/inference/latent_function_inference/var_dtc_parallel.py
+++ b/GPy/inference/latent_function_inference/var_dtc_parallel.py
@@ -38,7 +38,7 @@ class VarDTC_minibatch(LatentFunctionInference):
         self.midRes = {}
         self.batch_pos = 0 # the starting position of the current mini-batch

-        self.Y_speedup = False # Replace Y with the cholesky factor of YY.T, but the posterior inference will be wrong
+        self.Y_speedup = False # Replace Y with the Cholesky factor of YY.T, but the computation of the posterior object will be skipped.

     def __getstate__(self):
         # has to be overridden, as Cacher objects cannot be pickled.
@@ -76,6 +76,8 @@ class VarDTC_minibatch(LatentFunctionInference):
     def gatherPsiStat(self, kern, X, Z, Y, beta, uncertain_inputs):

         het_noise = beta.size > 1
+
+        assert beta.size == 1

         trYYT = self.get_trYYT(Y)
         if self.Y_speedup and not het_noise:
@@ -83,17 +85,16 @@ class VarDTC_minibatch(LatentFunctionInference):
         num_inducing = Z.shape[0]
         num_data, output_dim = Y.shape

-        if self.batchsize == None:
-            self.batchsize = num_data
+        batchsize = num_data if self.batchsize is None else self.batchsize

-        psi2_full = np.zeros((num_inducing,num_inducing))
+        psi2_full = np.zeros((num_inducing,num_inducing)) # MxM
         psi1Y_full = np.zeros((output_dim,num_inducing)) # DxM
         psi0_full = 0.
         YRY_full = 0.

-        for n_start in xrange(0,num_data,self.batchsize):
-            n_end = min(self.batchsize+n_start, num_data)
-            if (n_end-n_start)==num_data:
+        for n_start in xrange(0,num_data,batchsize):
+            n_end = min(batchsize+n_start, num_data)
+            if batchsize==num_data:
                 Y_slice = Y
                 X_slice = X
             else:
@@ -168,16 +169,18 @@ class VarDTC_minibatch(LatentFunctionInference):

         Kmm = kern.K(Z).copy()
         diag.add(Kmm, self.const_jitter)
-        KmmInv,Lm,LmInv,_ = pdinv(Kmm)
+        Lm = jitchol(Kmm, maxtries=100)

-        LmInvPsi2LmInvT = LmInv.dot(psi2_full).dot(LmInv.T)
+        LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right')
         Lambda = np.eye(Kmm.shape[0])+LmInvPsi2LmInvT
-        LInv,LL,LLInv,logdet_L = pdinv(Lambda)
-        b = LLInv.dot(LmInv.dot(psi1Y_full.T))
+        LL = jitchol(Lambda, maxtries=100)
+        logdet_L = 2.*np.sum(np.log(np.diag(LL)))
+        b = dtrtrs(LL,dtrtrs(Lm,psi1Y_full.T)[0])[0]
         bbt = np.square(b).sum()
-        v = LmInv.T.dot(LLInv.T.dot(b))
+        v = dtrtrs(Lm,dtrtrs(LL,b,trans=1)[0],trans=1)[0]
-        dL_dpsi2R = LmInv.T.dot(-LLInv.T.dot(tdot(b)+output_dim*np.eye(input_dim)).dot(LLInv)+output_dim*np.eye(input_dim)).dot(LmInv)/2.
+        tmp = -backsub_both_sides(LL, tdot(b)+output_dim*np.eye(input_dim), transpose='left')
+        dL_dpsi2R = backsub_both_sides(Lm, tmp+output_dim*np.eye(input_dim), transpose='left')/2.

         # Cache intermediate results
         self.midRes['dL_dpsi2R'] = dL_dpsi2R
@@ -196,14 +199,15 @@ class VarDTC_minibatch(LatentFunctionInference):
         # Compute dL_dKmm
         #======================================================================

-        dL_dKmm = dL_dpsi2R - output_dim*KmmInv.dot(psi2_full).dot(KmmInv)/2.
+        dL_dKmm = dL_dpsi2R - output_dim*backsub_both_sides(Lm, LmInvPsi2LmInvT, transpose='left')/2.

         #======================================================================
         # Compute the Posterior distribution of inducing points p(u|Y)
         #======================================================================

         if not self.Y_speedup or het_noise:
-            post = Posterior(woodbury_inv=LmInv.T.dot(np.eye(input_dim)-LInv).dot(LmInv), woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=Lm)
+            wd_inv = backsub_both_sides(Lm, np.eye(input_dim) - backsub_both_sides(LL, np.identity(input_dim), transpose='left'), transpose='left')
+            post = Posterior(woodbury_inv=wd_inv, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=Lm)
         else:
             post = None
@@ -242,7 +246,8 @@ class VarDTC_minibatch(LatentFunctionInference):
             YYT_factor = Y

         n_start = self.batch_pos
-        n_end = min(self.batchsize+n_start, num_data)
+        batchsize = num_data if self.batchsize is None else self.batchsize
+        n_end = min(batchsize+n_start, num_data)
         if n_end==num_data:
             isEnd = True
             self.batch_pos = 0
@@ -250,8 +255,12 @@
             isEnd = False
             self.batch_pos = n_end

-        Y_slice = YYT_factor[n_start:n_end]
-        X_slice = X[n_start:n_end]
+        if batchsize==num_data:
+            Y_slice = YYT_factor
+            X_slice = X
+        else:
+            Y_slice = YYT_factor[n_start:n_end]
+            X_slice = X[n_start:n_end]

         if not uncertain_inputs:
             psi0 = kern.Kdiag(X_slice)
@@ -405,3 +414,66 @@

     # dL_dthetaL
     model.likelihood.update_gradients(dL_dthetaL)
+
+def update_gradients_sparsegp(model, mpi_comm=None):
+    if mpi_comm == None:
+        Y = model.Y
+        X = model.X
+    else:
+        Y = model.Y_local
+        X = model.X[model.N_range[0]:model.N_range[1]]
+
+    model._log_marginal_likelihood, dL_dKmm, model.posterior = model.inference_method.inference_likelihood(model.kern, X, model.Z, model.likelihood, Y)
+
+    het_noise = model.likelihood.variance.size > 1
+
+    if het_noise:
+        dL_dthetaL = np.empty((model.Y.shape[0],))
+    else:
+        dL_dthetaL = np.float64(0.)
+
+    kern_grad = model.kern.gradient.copy()
+    kern_grad[:] = 0.
+    model.Z.gradient = 0.
+
+    isEnd = False
+    while not isEnd:
+        isEnd, n_range, grad_dict = model.inference_method.inference_minibatch(model.kern, X, model.Z, model.likelihood, Y)
+
+        if (n_range[1]-n_range[0])==X.shape[0]:
+            X_slice = X
+        elif mpi_comm == None:
+            X_slice = model.X[n_range[0]:n_range[1]]
+        else:
+            X_slice = model.X[model.N_range[0]+n_range[0]:model.N_range[0]+n_range[1]]
+
+        model.kern.update_gradients_diag(grad_dict['dL_dKdiag'], X_slice)
+        kern_grad += model.kern.gradient
+        model.kern.update_gradients_full(grad_dict['dL_dKnm'], X_slice, model.Z)
+        kern_grad += model.kern.gradient
+
+        model.Z.gradient += model.kern.gradients_X(grad_dict['dL_dKnm'].T, model.Z, X_slice)
+
+        if het_noise:
+            dL_dthetaL[n_range[0]:n_range[1]] = grad_dict['dL_dthetaL']
+        else:
+            dL_dthetaL += grad_dict['dL_dthetaL']
+
+    # Gather the gradients from multiple MPI nodes
+    if mpi_comm != None:
+        if het_noise:
+            raise NotImplementedError("het_noise not implemented!")
+        kern_grad_all = kern_grad.copy()
+        Z_grad_all = model.Z.gradient.copy()
+        mpi_comm.Allreduce([kern_grad, MPI.DOUBLE], [kern_grad_all, MPI.DOUBLE])
+        mpi_comm.Allreduce([model.Z.gradient, MPI.DOUBLE], [Z_grad_all, MPI.DOUBLE])
+        kern_grad = kern_grad_all
+        model.Z.gradient = Z_grad_all
+
+    model.kern.update_gradients_full(dL_dKmm, model.Z, None)
+    model.kern.gradient += kern_grad
+
+    model.Z.gradient += model.kern.gradients_X(dL_dKmm, model.Z)
+
+    # dL_dthetaL
+    model.likelihood.update_gradients(dL_dthetaL)
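Note on the `inference_likelihood` rewrite above: `pdinv` (which forms explicit inverses) is replaced by Cholesky factors plus triangular solves (`jitchol`, `dtrtrs`, `backsub_both_sides`), which is cheaper and numerically safer. A scipy sketch of the identity being exploited (a demonstration, not GPy code):

    import numpy as np
    from scipy.linalg import cholesky, solve_triangular

    # Replace K^{-1} B K^{-T} computed via an explicit inverse with two
    # triangular solves against the Cholesky factor L (K = L L^T).
    rng = np.random.RandomState(0)
    A = rng.randn(5, 5); K = A.dot(A.T) + 5*np.eye(5)   # SPD matrix
    B = rng.randn(5, 5); B = B + B.T                    # symmetric B

    L = cholesky(K, lower=True)
    tmp = solve_triangular(L, B, lower=True)             # L^{-1} B
    LinvBLinvT = solve_triangular(L, tmp.T, lower=True)  # L^{-1} B L^{-T}

    direct = np.linalg.inv(L).dot(B).dot(np.linalg.inv(L).T)
    print(np.allclose(LinvBLinvT, direct))  # True, without forming inverses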
diff --git a/GPy/kern/_src/hierarchical.py b/GPy/kern/_src/hierarchical.py
deleted file mode 100644
index ac360ec7..00000000
--- a/GPy/kern/_src/hierarchical.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# Copyright (c) 2012, James Hensman
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-from kernpart import Kernpart
-import numpy as np
-from independent_outputs import index_to_slices
-
-class Hierarchical(Kernpart):
-    """
-    A kernel part which can represent a hierarchy of independence: a generalisation of independent_outputs
-
-    """
-    def __init__(self,parts,name='hierarchy'):
-        self.levels = len(parts)
-        self.input_dim = parts[0].input_dim + 1
-        self.num_params = np.sum([k.num_params for k in parts])
-        self.name = name
-        self.parts = parts
-
-        self.param_starts = np.hstack((0,np.cumsum([k.num_params for k in self.parts[:-1]])))
-        self.param_stops = np.cumsum([k.num_params for k in self.parts])
-
-    def _get_params(self):
-        return np.hstack([k._get_params() for k in self.parts])
-
-    def _set_params(self,x):
-        [k._set_params(x[start:stop]) for k, start, stop in zip(self.parts, self.param_starts, self.param_stops)]
-
-    def _get_param_names(self):
-        return sum([[str(i)+'_'+k.name+'_'+n for n in k._get_param_names()] for i,k in enumerate(self.parts)],[])
-
-    def _sort_slices(self,X,X2):
-        slices = [index_to_slices(x) for x in X[:,-self.levels:].T]
-        X = X[:,:-self.levels]
-        if X2 is None:
-            slices2 = slices
-            X2 = X
-        else:
-            slices2 = [index_to_slices(x) for x in X2[:,-self.levels:].T]
-            X2 = X2[:,:-self.levels]
-        return X, X2, slices, slices2
-
-    def K(self,X,X2,target):
-        X, X2, slices, slices2 = self._sort_slices(X,X2)
-        [[[[k.K(X[s],X2[s2],target[s,s2]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices_,slices2_)] for k, slices_, slices2_ in zip(self.parts,slices,slices2)]
-
-    def Kdiag(self,X,target):
-        raise NotImplementedError
-        #X,slices = X[:,:-1],index_to_slices(X[:,-1])
-        #[[self.k.Kdiag(X[s],target[s]) for s in slices_i] for slices_i in slices]
-
-    def _param_grad_helper(self,dL_dK,X,X2,target):
-        X, X2, slices, slices2 = self._sort_slices(X,X2)
-        [[[[k._param_grad_helper(dL_dK[s,s2],X[s],X2[s2],target[p_start:p_stop]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices_, slices2_)] for k, p_start, p_stop, slices_, slices2_ in zip(self.parts, self.param_starts, self.param_stops, slices, slices2)]
-
-    def gradients_X(self,dL_dK,X,X2,target):
-        raise NotImplementedError
-        #X,slices = X[:,:-1],index_to_slices(X[:,-1])
-        #if X2 is None:
-        #    X2,slices2 = X,slices
-        #else:
-        #    X2,slices2 = X2[:,:-1],index_to_slices(X2[:,-1])
-        #[[[self.k.gradients_X(dL_dK[s,s2],X[s],X2[s2],target[s,:-1]) for s in slices_i] for s2 in slices_j] for slices_i,slices_j in zip(slices,slices2)]
-#
-    def dKdiag_dX(self,dL_dKdiag,X,target):
-        raise NotImplementedError
-        #X,slices = X[:,:-1],index_to_slices(X[:,-1])
-        #[[self.k.dKdiag_dX(dL_dKdiag[s],X[s],target[s,:-1]) for s in slices_i] for slices_i in slices]
-
-    def dKdiag_dtheta(self,dL_dKdiag,X,target):
-        raise NotImplementedError
-        #X,slices = X[:,:-1],index_to_slices(X[:,-1])
-        #[[self.k.dKdiag_dX(dL_dKdiag[s],X[s],target) for s in slices_i] for slices_i in slices]
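Note on the deleted `Hierarchical` kernel part: it relied on `index_to_slices` (imported from `independent_outputs`) to map the trailing index columns of X to row slices, then evaluated a sub-kernel per block. A minimal re-implementation sketch of that slicing helper (semantics assumed from usage here, not the GPy source):

    import numpy as np

    def index_to_slices(index):
        """Turn an index column into per-output lists of contiguous slices,
        e.g. [0,0,1,1,1,0] -> [[slice(0,2), slice(5,6)], [slice(2,5)]]."""
        index = np.asarray(index, dtype=int)
        slices = [[] for _ in range(index.max()+1)]
        start = 0
        for i in range(1, len(index)+1):
            # Close the current run when the index value changes or we hit the end.
            if i == len(index) or index[i] != index[start]:
                slices[index[start]].append(slice(start, i))
                start = i
        return slices

    print(index_to_slices([0, 0, 1, 1, 1, 0]))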
diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py
index 506d32e6..57b2bff5 100644
--- a/GPy/kern/_src/kern.py
+++ b/GPy/kern/_src/kern.py
@@ -132,13 +132,20 @@ class Kern(Parameterized):
         """
         raise NotImplementedError

-    def plot(self, *args, **kwargs):
+    def plot(self, x=None, fignum=None, ax=None, title=None, plot_limits=None, resolution=None, **mpl_kwargs):
         """
-        See GPy.plotting.matplot_dep.plot
+        Plot this kernel.
+        :param x: the value to use for the other kernel argument (kernels are a function of two variables!)
+        :param fignum: figure number of the plot
+        :param ax: matplotlib axis to plot on
+        :param title: the matplotlib title
+        :param plot_limits: the range over which to plot the kernel
+        :param resolution: the resolution of the lines used in plotting
+        :param mpl_kwargs: valid keyword arguments to pass through to matplotlib (e.g. lw=7)
         """
         assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
         from ...plotting.matplot_dep import kernel_plots
-        kernel_plots.plot(self,*args)
+        kernel_plots.plot(self, x, fignum, ax, title, plot_limits, resolution, **mpl_kwargs)

     def plot_ARD(self, *args, **kw):
         """
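With the explicit signature above, calling `Kern.plot` no longer depends on positional `*args`. A hypothetical usage sketch (the kernel choice and argument values are illustrative, and `plot_limits` is assumed to accept an (xmin, xmax) pair):

    import numpy as np
    import matplotlib.pyplot as plt
    import GPy

    k = GPy.kern.RBF(1, lengthscale=0.5)
    fig, ax = plt.subplots()
    # Keyword arguments now map directly onto the documented parameters;
    # lw=2 is passed through to matplotlib via **mpl_kwargs.
    k.plot(x=np.zeros((1, 1)), ax=ax, title='RBF', plot_limits=(-3, 3),
           resolution=401, lw=2)
    plt.show()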
diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py
index 73fd6f8f..0838b684 100644
--- a/GPy/models/bayesian_gplvm.py
+++ b/GPy/models/bayesian_gplvm.py
@@ -7,7 +7,6 @@
 from ..core.sparse_gp_mpi import SparseGP_MPI
 from ..likelihoods import Gaussian
 from ..core.parameterization.variational import NormalPosterior, NormalPrior
 from ..inference.latent_function_inference.var_dtc_parallel import VarDTC_minibatch
-from ..inference.latent_function_inference.var_dtc_gpu import VarDTC_GPU
 import logging

 class BayesianGPLVM(SparseGP_MPI):
@@ -68,14 +67,12 @@ class BayesianGPLVM(SparseGP_MPI):
         if isinstance(inference_method,VarDTC_minibatch):
             inference_method.mpi_comm = mpi_comm

-        if kernel.useGPU and isinstance(inference_method, VarDTC_GPU):
-            kernel.psicomp.GPU_direct = True
-
         super(BayesianGPLVM,self).__init__(X, Y, Z, kernel, likelihood=likelihood,
                                            name=name, inference_method=inference_method,
                                            normalizer=normalizer, mpi_comm=mpi_comm,
                                            variational_prior=self.variational_prior,
                                            )
+        self.link_parameter(self.X, index=0)

     def set_X_gradients(self, X, X_grad):
         """Set the gradients of the posterior distribution of X in its specific form."""
diff --git a/GPy/models/bayesian_gplvm_minibatch.py b/GPy/models/bayesian_gplvm_minibatch.py
index 7100fa72..a2e44bbe 100644
--- a/GPy/models/bayesian_gplvm_minibatch.py
+++ b/GPy/models/bayesian_gplvm_minibatch.py
@@ -6,7 +6,6 @@
 from .. import kern
 from ..likelihoods import Gaussian
 from ..core.parameterization.variational import NormalPosterior, NormalPrior
 from ..inference.latent_function_inference.var_dtc_parallel import VarDTC_minibatch
-from ..inference.latent_function_inference.var_dtc_gpu import VarDTC_GPU
 import logging

 from GPy.models.sparse_gp_minibatch import SparseGPMiniBatch
diff --git a/GPy/models/sparse_gp_regression.py b/GPy/models/sparse_gp_regression.py
index 3be2aed2..5a56bb7d 100644
--- a/GPy/models/sparse_gp_regression.py
+++ b/GPy/models/sparse_gp_regression.py
@@ -4,12 +4,13 @@
 import numpy as np
 from ..core import SparseGP
+from ..core.sparse_gp_mpi import SparseGP_MPI
 from .. import likelihoods
 from .. import kern
 from ..inference.latent_function_inference import VarDTC
 from ..core.parameterization.variational import NormalPosterior

-class SparseGPRegression(SparseGP):
+class SparseGPRegression(SparseGP_MPI):
     """
     Gaussian Process model for regression

@@ -48,7 +49,14 @@ class SparseGPRegression(SparseGP):
         if not (X_variance is None):
             X = NormalPosterior(X,X_variance)

-        SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method=VarDTC(), normalizer=normalizer)
+        SparseGP_MPI.__init__(self, X, Y, Z, kernel, likelihood, inference_method=VarDTC(), normalizer=normalizer)
+
+    def parameters_changed(self):
+        from ..inference.latent_function_inference.var_dtc_parallel import update_gradients_sparsegp,VarDTC_minibatch
+        if isinstance(self.inference_method,VarDTC_minibatch):
+            update_gradients_sparsegp(self, mpi_comm=self.mpi_comm)
+        else:
+            super(SparseGPRegression, self).parameters_changed()

 class SparseGPRegressionUncertainInput(SparseGP):
     """
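Note on the `SparseGPRegression` change above: `parameters_changed` now dispatches on the type of the inference object, so the same model class supports both the one-shot VarDTC update and the minibatch/MPI path. The pattern in isolation, with stand-in classes (nothing here is GPy API):

    class DefaultInference(object):
        pass

    class MinibatchInference(object):
        pass

    class SparseModel(object):
        def __init__(self, inference_method):
            self.inference_method = inference_method

        def parameters_changed(self):
            # Route gradient accumulation to the minibatch path only when the
            # chosen inference object supports it, mirroring the override above.
            if isinstance(self.inference_method, MinibatchInference):
                return "minibatch gradient accumulation"
            return "one-shot gradient computation"

    print(SparseModel(MinibatchInference()).parameters_changed())
    print(SparseModel(DefaultInference()).parameters_changed())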
diff --git a/GPy/models/ss_gplvm.py b/GPy/models/ss_gplvm.py
index 4ea4f297..04006d84 100644
--- a/GPy/models/ss_gplvm.py
+++ b/GPy/models/ss_gplvm.py
@@ -8,7 +8,6 @@
 from .. import kern
 from ..likelihoods import Gaussian
 from ..core.parameterization.variational import SpikeAndSlabPrior, SpikeAndSlabPosterior
 from ..inference.latent_function_inference.var_dtc_parallel import update_gradients, VarDTC_minibatch
-from ..inference.latent_function_inference.var_dtc_gpu import VarDTC_GPU
 from ..kern._src.psi_comp.ssrbf_psi_gpucomp import PSICOMP_SSRBF_GPU

 class SSGPLVM(SparseGP_MPI):
@@ -72,6 +71,7 @@ class SSGPLVM(SparseGP_MPI):
         super(SSGPLVM,self).__init__(X, Y, Z, kernel, likelihood, variational_prior=self.variational_prior, inference_method=inference_method, name=name, mpi_comm=mpi_comm, normalizer=normalizer, **kwargs)
 #        self.X.unfix()
 #        self.X.variance.constrain_positive()
+        self.link_parameter(self.X, index=0)

         if self.group_spike:
             [self.X.gamma[:,i].tie('tieGamma'+str(i)) for i in xrange(self.X.gamma.shape[1])] # Tie columns together
diff --git a/GPy/plotting/matplot_dep/kernel_plots.py b/GPy/plotting/matplot_dep/kernel_plots.py
index c2bd7d38..dd0f1cf5 100644
--- a/GPy/plotting/matplot_dep/kernel_plots.py
+++ b/GPy/plotting/matplot_dep/kernel_plots.py
@@ -99,7 +99,26 @@ def plot_ARD(kernel, fignum=None, ax=None, title='', legend=False, filtering=None):
     return ax

-def plot(kernel, x=None, plot_limits=None, resolution=None, *args, **kwargs):
+
+def plot(kernel, x=None, fignum=None, ax=None, title=None, plot_limits=None, resolution=None, **mpl_kwargs):
+    """
+    Plot a kernel.
+    :param x: the value to use for the other kernel argument (kernels are a function of two variables!)
+    :param fignum: figure number of the plot
+    :param ax: matplotlib axis to plot on
+    :param title: the matplotlib title
+    :param plot_limits: the range over which to plot the kernel
+    :param resolution: the resolution of the lines used in plotting
+    :param mpl_kwargs: valid keyword arguments to pass through to matplotlib (e.g. lw=7)
+    """
+    fig, ax = ax_default(fignum, ax)
+
+    if title is None:
+        ax.set_title('%s kernel' % kernel.name)
+    else:
+        ax.set_title(title)
+
     if kernel.input_dim == 1:
         if x is None:
             x = np.zeros((1, 1))
@@ -117,10 +136,10 @@ def plot(kernel, x=None, plot_limits=None, resolution=None, *args, **kwargs):
         Xnew = np.linspace(xmin, xmax, resolution or 201)[:, None]
         Kx = kernel.K(Xnew, x)
-        pb.plot(Xnew, Kx, *args, **kwargs)
-        pb.xlim(xmin, xmax)
-        pb.xlabel("x")
-        pb.ylabel("k(x,%0.1f)" % x)
+        ax.plot(Xnew, Kx, **mpl_kwargs)
+        ax.set_xlim(xmin, xmax)
+        ax.set_xlabel("x")
+        ax.set_ylabel("k(x,%0.1f)" % x)

     elif kernel.input_dim == 2:
         if x is None:
@@ -142,11 +161,11 @@ def plot(kernel, x=None, plot_limits=None, resolution=None, *args, **kwargs):
         Xnew = np.vstack((xx.flatten(), yy.flatten())).T
         Kx = kernel.K(Xnew, x)
         Kx = Kx.reshape(resolution, resolution).T
-        pb.contour(xx, xx, Kx, vmin=Kx.min(), vmax=Kx.max(), cmap=pb.cm.jet, *args, **kwargs) # @UndefinedVariable
-        pb.xlim(xmin[0], xmax[0])
-        pb.ylim(xmin[1], xmax[1])
-        pb.xlabel("x1")
-        pb.ylabel("x2")
-        pb.title("k(x1,x2 ; %0.1f,%0.1f)" % (x[0, 0], x[0, 1]))
+        ax.contour(xx, yy, Kx, vmin=Kx.min(), vmax=Kx.max(), cmap=pb.cm.jet, **mpl_kwargs) # @UndefinedVariable
+        ax.set_xlim(xmin[0], xmax[0])
+        ax.set_ylim(xmin[1], xmax[1])
+        ax.set_xlabel("x1")
+        ax.set_ylabel("x2")
+        ax.set_title("k(x1,x2 ; %0.1f,%0.1f)" % (x[0, 0], x[0, 1]))

     else:
         raise NotImplementedError, "Cannot plot a kernel with more than two input dimensions"
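Note on the `kernel_plots.plot` rewrite above: pyplot state-machine calls (`pb.plot`, `pb.xlim`, ...) are replaced by methods on an explicit Axes object, so the function composes with a caller-supplied `ax`; the `pb.contour(xx, xx, ...)` grid bug also becomes `ax.contour(xx, yy, ...)`. The two styles side by side (illustrative, not GPy code):

    import numpy as np
    import matplotlib.pyplot as plt

    x = np.linspace(-3, 3, 201)
    # State-machine style (what the old code did) draws on whichever figure
    # happens to be current:
    #   plt.plot(x, np.exp(-x**2)); plt.xlim(-3, 3); plt.xlabel("x")
    # Axes-object style (what the patch moves to) targets one specific axis:
    fig, ax = plt.subplots()
    ax.plot(x, np.exp(-x**2))
    ax.set_xlim(-3, 3)
    ax.set_xlabel("x")
    ax.set_ylabel("k(x,0.0)")
    plt.show()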
diff --git a/GPy/util/mpi.py b/GPy/util/mpi.py
deleted file mode 100644
index e3d7d928..00000000
--- a/GPy/util/mpi.py
+++ /dev/null
@@ -1,35 +0,0 @@
-"""
-The tools for mpi
-"""
-
-try:
-    import numpy as np
-    from mpi4py import MPI
-    numpy_to_MPI_typemap = {
-        np.dtype(np.float64) : MPI.DOUBLE,
-        np.dtype(np.float32) : MPI.FLOAT,
-        np.dtype(np.int)     : MPI.INT,
-        np.dtype(np.int8)    : MPI.CHAR,
-        np.dtype(np.uint8)   : MPI.UNSIGNED_CHAR,
-        np.dtype(np.int32)   : MPI.INT,
-        np.dtype(np.uint32)  : MPI.UNSIGNED_INT,
-    }
-except:
-    pass
-
-def divide_data(datanum, comm):
-
-    residue = (datanum)%comm.size
-    datanum_list = np.empty((comm.size),dtype=np.int32)
-    for i in xrange(comm.size):
-        if i<residue:
diff --git a/GPy/util/parallel.py b/GPy/util/parallel.py
new file mode 100644
--- /dev/null
+++ b/GPy/util/parallel.py
+def divide_data(datanum, rank, size):
+    assert size>0
+
+    residue = (datanum)%size
+    datanum_list = np.empty((size),dtype=np.int32)
+    for i in xrange(size):
+        if i
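The patch ends mid-line above; `divide_data` splits N data points across MPI ranks as evenly as possible. A self-contained sketch of that arithmetic (the `(datanum, rank, size)` signature is taken from the visible fragment; the return convention here is an assumption):

    import numpy as np

    def divide_data(datanum, rank, size):
        """Split `datanum` items across `size` ranks; the first
        `datanum % size` ranks get one extra item.
        Returns (start, end, counts_per_rank) for this rank."""
        residue = datanum % size
        datanum_list = np.empty(size, dtype=np.int32)
        for i in range(size):
            datanum_list[i] = datanum // size + (1 if i < residue else 0)
        start = int(np.sum(datanum_list[:rank]))
        return start, start + int(datanum_list[rank]), datanum_list

    print(divide_data(10, 0, 3))  # (0, 4, array([4, 3, 3], dtype=int32))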