diff --git a/GPy/inference/latent_function_inference/__init__.py b/GPy/inference/latent_function_inference/__init__.py index 3f6af799..97815a41 100644 --- a/GPy/inference/latent_function_inference/__init__.py +++ b/GPy/inference/latent_function_inference/__init__.py @@ -91,6 +91,8 @@ from .pep import PEP from .var_dtc_parallel import VarDTC_minibatch from .var_gauss import VarGauss from .gaussian_grid_inference import GaussianGridInference +from .vardtc_svi_multiout import VarDTC_SVI_Multiout +from .vardtc_svi_multiout_miss import VarDTC_SVI_Multiout_Miss # class FullLatentFunctionData(object): diff --git a/GPy/inference/latent_function_inference/vardtc_md.py b/GPy/inference/latent_function_inference/vardtc_md.py new file mode 100644 index 00000000..2297074a --- /dev/null +++ b/GPy/inference/latent_function_inference/vardtc_md.py @@ -0,0 +1,177 @@ + + + +from GPy.util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri,pdinv, dpotri +from GPy.util import diag +from GPy.core.parameterization.variational import VariationalPosterior +import numpy as np +from GPy.inference.latent_function_inference import LatentFunctionInference +from GPy.inference.latent_function_inference.posterior import Posterior + +log_2_pi = np.log(2*np.pi) + +class VarDTC_MD(LatentFunctionInference): + """ + An object for inference when the likelihood is Gaussian, but we want to do sparse inference. + + The function self.inference returns a Posterior object, which summarizes + the posterior. + + For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it. + + """ + const_jitter = 1e-6 + + def gatherPsiStat(self, kern, X, Z, Y, beta, uncertain_inputs): + + if uncertain_inputs: + psi0 = kern.psi0(Z, X) + psi1 = kern.psi1(Z, X) + psi2 = kern.psi2n(Z, X) + else: + psi0 = kern.Kdiag(X) + psi1 = kern.K(X, Z) + psi2 = psi1[:,:,None]*psi1[:,None,:] + + return psi0, psi1, psi2 + + def inference(self, kern, X, Z, likelihood, Y, indexD, output_dim, Y_metadata=None, Lm=None, dL_dKmm=None, Kuu_sigma=None): + """ + The first phase of inference: + Compute: log-likelihood, dL_dKmm + + Cached intermediate results: Kmm, KmmInv, + """ + + input_dim = Z.shape[0] + + uncertain_inputs = isinstance(X, VariationalPosterior) + + beta = 1./likelihood.variance + if len(beta)==1: + beta = np.zeros(output_dim)+beta + + beta_exp = np.zeros(indexD.shape[0]) + for d in range(output_dim): + beta_exp[indexD==d] = beta[d] + + psi0, psi1, psi2 = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs) + + psi2_sum = (beta_exp[:,None,None]*psi2).sum(0)/output_dim + + #====================================================================== + # Compute Common Components + #====================================================================== + + Kmm = kern.K(Z).copy() + if Kuu_sigma is not None: + diag.add(Kmm, Kuu_sigma) + else: + diag.add(Kmm, self.const_jitter) + Lm = jitchol(Kmm) + + logL = 0. 
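+        # Accumulate the bound and its gradients output-by-output below: each
+        # output d has its own noise precision beta[d], so its data terms can
+        # be computed independently and summed.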
+        dL_dthetaL = np.zeros(output_dim)
+        dL_dKmm = np.zeros_like(Kmm)
+        dL_dpsi0 = np.zeros_like(psi0)
+        dL_dpsi1 = np.zeros_like(psi1)
+        dL_dpsi2 = np.zeros_like(psi2)
+        wv = np.empty((Kmm.shape[0],output_dim))
+
+        for d in range(output_dim):
+            idx_d = indexD==d
+            Y_d = Y[idx_d]
+            N_d = Y_d.shape[0]
+            beta_d = beta[d]
+
+            psi2_d = psi2[idx_d].sum(0)*beta_d
+            psi1Y = Y_d.T.dot(psi1[idx_d])*beta_d
+            psi0_d = psi0[idx_d].sum()*beta_d
+            YRY_d = np.square(Y_d).sum()*beta_d
+
+            LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2_d, 'right')
+
+            Lambda = np.eye(Kmm.shape[0])+LmInvPsi2LmInvT
+            LL = jitchol(Lambda)
+            LmLL = Lm.dot(LL)
+
+            b = dtrtrs(LmLL, psi1Y.T)[0].T
+            bbt = np.square(b).sum()
+            v = dtrtrs(LmLL, b.T, trans=1)[0].T
+            LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)
+
+            tmp = -backsub_both_sides(LL, LLinvPsi1TYYTPsi1LLinvT)
+            dL_dpsi2R = backsub_both_sides(Lm, tmp+np.eye(input_dim))/2
+
+            logL_R = -N_d*np.log(beta_d)
+            logL += -((N_d*log_2_pi+logL_R+psi0_d-np.trace(LmInvPsi2LmInvT))+YRY_d- bbt)/2.
+
+            dL_dKmm += dL_dpsi2R - backsub_both_sides(Lm, LmInvPsi2LmInvT)/2
+
+            dL_dthetaL[d:d+1] = (YRY_d*beta_d + beta_d*psi0_d - N_d*beta_d)/2. - beta_d*(dL_dpsi2R*psi2_d).sum() - beta_d*np.trace(LLinvPsi1TYYTPsi1LLinvT)
+
+            dL_dpsi0[idx_d] = -beta_d/2.
+            dL_dpsi1[idx_d] = beta_d*np.dot(Y_d,v)
+            dL_dpsi2[idx_d] = beta_d*dL_dpsi2R
+            wv[:,d] = v
+
+        LmInvPsi2LmInvT = backsub_both_sides(Lm, psi2_sum, 'right')
+
+        Lambda = np.eye(Kmm.shape[0])+LmInvPsi2LmInvT
+        LL = jitchol(Lambda)
+        LmLL = Lm.dot(LL)
+        logdet_L = 2.*np.sum(np.log(np.diag(LL)))
+        dL_dpsi2R_common = dpotri(LmLL)[0]/-2.
+        dL_dpsi2 += dL_dpsi2R_common[None,:,:]*beta_exp[:,None,None]
+
+        for d in range(output_dim):
+            dL_dthetaL[d] += (dL_dpsi2R_common*psi2[indexD==d].sum(0)).sum()*-beta[d]*beta[d]
+
+        dL_dKmm += dL_dpsi2R_common*output_dim
+
+        logL += -output_dim*logdet_L/2.
+
+        #======================================================================
+        # Compute dL_dKmm
+        #======================================================================
+
+        # dL_dKmm = dL_dpsi2R - output_dim* backsub_both_sides(Lm, LmInvPsi2LmInvT)/2 #LmInv.T.dot(LmInvPsi2LmInvT).dot(LmInv)/2.
+
+        #======================================================================
+        # Compute the Posterior distribution of inducing points p(u|Y)
+        #======================================================================
+
+        LLInvLmT = dtrtrs(LL, Lm.T)[0]
+        cov = tdot(LLInvLmT.T)
+
+        wd_inv = backsub_both_sides(Lm, np.eye(input_dim)- backsub_both_sides(LL, np.identity(input_dim), transpose='left'), transpose='left')
+        post = Posterior(woodbury_inv=wd_inv, woodbury_vector=wv, K=Kmm, mean=None, cov=cov, K_chol=Lm)
+
+        #======================================================================
+        # Compute dL_dthetaL for uncertain inputs and non-heteroscedastic noise
+        #======================================================================
+
+        # for d in range(output_dim):
+        #     dL_dthetaL[d:d+1] += - beta[d]*beta[d]*(dL_dpsi2R[None,:,:] * psi2[indexD==d]/output_dim).sum()
+        # dL_dthetaL += - (dL_dpsi2R[None,:,:] * psi2_sum*D beta*(dL_dpsi2R*psi2).sum()
+
+        #======================================================================
+        # Compute dL_dpsi
+        #======================================================================
+
+        if not uncertain_inputs:
+            dL_dpsi1 += (psi1[:,None,:]*dL_dpsi2).sum(2)*2.
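+        # With uncertain inputs the gradients are returned w.r.t. the psi
+        # statistics; otherwise they map directly onto Kdiag (psi0) and Knm (psi1).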
+ + if uncertain_inputs: + grad_dict = {'dL_dKmm': dL_dKmm, + 'dL_dpsi0':dL_dpsi0, + 'dL_dpsi1':dL_dpsi1, + 'dL_dpsi2':dL_dpsi2, + 'dL_dthetaL':dL_dthetaL} + else: + grad_dict = {'dL_dKmm': dL_dKmm, + 'dL_dKdiag':dL_dpsi0, + 'dL_dKnm':dL_dpsi1, + 'dL_dthetaL':dL_dthetaL} + + return post, logL, grad_dict diff --git a/GPy/inference/latent_function_inference/vardtc_svi_multiout.py b/GPy/inference/latent_function_inference/vardtc_svi_multiout.py new file mode 100644 index 00000000..99078ab5 --- /dev/null +++ b/GPy/inference/latent_function_inference/vardtc_svi_multiout.py @@ -0,0 +1,271 @@ +#from .posterior import Posterior +from GPy.util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri,pdinv, dpotri +from GPy.util import diag +from GPy.core.parameterization.variational import VariationalPosterior +import numpy as np +from GPy.inference.latent_function_inference import LatentFunctionInference +from GPy.inference.latent_function_inference.posterior import Posterior +log_2_pi = np.log(2*np.pi) + + +class VarDTC_SVI_Multiout(LatentFunctionInference): + """ + An object for inference when the likelihood is Gaussian, but we want to do sparse inference. + + The function self.inference returns a Posterior object, which summarizes + the posterior. + + For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it. + + """ + const_jitter = 1e-6 + + def get_trYYT(self, Y): + return np.sum(np.square(Y)) + + def get_YYTfactor(self, Y): + N, D = Y.shape + if (N>=D): + return Y.view(np.ndarray) + else: + return jitchol(tdot(Y)) + + def gatherPsiStat(self, kern, X, Z, uncertain_inputs): + + if uncertain_inputs: + psi0 = kern.psi0(Z, X).sum() + psi1 = kern.psi1(Z, X) + psi2 = kern.psi2(Z, X) + else: + psi0 = kern.Kdiag(X).sum() + psi1 = kern.K(X, Z) + psi2 = tdot(psi1.T) + + return psi0, psi1, psi2 + + def inference(self, kern_r, kern_c, Xr, Xc, Zr, Zc, likelihood, Y, qU_mean ,qU_var_r, qU_var_c): + """ + The SVI-VarDTC inference + """ + + N, D, Mr, Mc, Qr, Qc = Y.shape[0], Y.shape[1], Zr.shape[0], Zc.shape[0], Zr.shape[1], Zc.shape[1] + + uncertain_inputs_r = isinstance(Xr, VariationalPosterior) + uncertain_inputs_c = isinstance(Xc, VariationalPosterior) + uncertain_outputs = isinstance(Y, VariationalPosterior) + + beta = 1./likelihood.variance + + psi0_r, psi1_r, psi2_r = self.gatherPsiStat(kern_r, Xr, Zr, uncertain_inputs_r) + psi0_c, psi1_c, psi2_c = self.gatherPsiStat(kern_c, Xc, Zc, uncertain_inputs_c) + + #====================================================================== + # Compute Common Components + #====================================================================== + + Kuu_r = kern_r.K(Zr).copy() + diag.add(Kuu_r, self.const_jitter) + Lr = jitchol(Kuu_r) + + Kuu_c = kern_c.K(Zc).copy() + diag.add(Kuu_c, self.const_jitter) + Lc = jitchol(Kuu_c) + + mu, Sr, Sc = qU_mean, qU_var_r, qU_var_c + LSr = jitchol(Sr) + LSc = jitchol(Sc) + + LcInvMLrInvT = dtrtrs(Lc,dtrtrs(Lr,mu.T)[0].T)[0] + LcInvPsi2_cLcInvT = backsub_both_sides(Lc, psi2_c,'right') + LrInvPsi2_rLrInvT = backsub_both_sides(Lr, psi2_r,'right') + LcInvLSc = dtrtrs(Lc, LSc)[0] + LrInvLSr = dtrtrs(Lr, LSr)[0] + LcInvScLcInvT = tdot(LcInvLSc) + LrInvSrLrInvT = tdot(LrInvLSr) + LcInvPsi1_cT = dtrtrs(Lc, psi1_c.T)[0] + LrInvPsi1_rT = dtrtrs(Lr, psi1_r.T)[0] + + tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT = (LrInvPsi2_rLrInvT*LrInvSrLrInvT).sum() + tr_LcInvPsi2_cLcInvT_LcInvScLcInvT = (LcInvPsi2_cLcInvT*LcInvScLcInvT).sum() + tr_LrInvSrLrInvT = np.square(LrInvLSr).sum() + tr_LcInvScLcInvT 
= np.square(LcInvLSc).sum() + tr_LrInvPsi2_rLrInvT = np.trace(LrInvPsi2_rLrInvT) + tr_LcInvPsi2_cLcInvT = np.trace(LcInvPsi2_cLcInvT) + + #====================================================================== + # Compute log-likelihood + #====================================================================== + + logL_A = - np.square(Y).sum() \ + - (LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT)*LrInvPsi2_rLrInvT).sum() \ + - tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT* tr_LcInvPsi2_cLcInvT_LcInvScLcInvT \ + + 2 * (Y * LcInvPsi1_cT.T.dot(LcInvMLrInvT).dot(LrInvPsi1_rT)).sum() - psi0_c * psi0_r \ + + tr_LrInvPsi2_rLrInvT * tr_LcInvPsi2_cLcInvT + + logL = -N*D/2.*(np.log(2.*np.pi)-np.log(beta)) + beta/2.* logL_A \ + -Mc * (np.log(np.diag(Lr)).sum()-np.log(np.diag(LSr)).sum()) -Mr * (np.log(np.diag(Lc)).sum()-np.log(np.diag(LSc)).sum()) \ + - np.square(LcInvMLrInvT).sum()/2. - tr_LrInvSrLrInvT * tr_LcInvScLcInvT/2. + Mr*Mc/2. + + #====================================================================== + # Compute dL_dKuu + #====================================================================== + + tmp = beta* LcInvPsi2_cLcInvT.dot(LcInvMLrInvT).dot(LrInvPsi2_rLrInvT).dot(LcInvMLrInvT.T) \ + + beta* tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT * LcInvPsi2_cLcInvT.dot(LcInvScLcInvT) \ + - beta* LcInvMLrInvT.dot(LrInvPsi1_rT).dot(Y.T).dot(LcInvPsi1_cT.T) \ + - beta/2. * tr_LrInvPsi2_rLrInvT* LcInvPsi2_cLcInvT - Mr/2.*np.eye(Mc) \ + + tdot(LcInvMLrInvT)/2. + tr_LrInvSrLrInvT/2. * LcInvScLcInvT + + dL_dKuu_c = backsub_both_sides(Lc, tmp, 'left') + dL_dKuu_c += dL_dKuu_c.T + dL_dKuu_c *= 0.5 + + tmp = beta* LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT).dot(LrInvPsi2_rLrInvT) \ + + beta* tr_LcInvPsi2_cLcInvT_LcInvScLcInvT * LrInvPsi2_rLrInvT.dot(LrInvSrLrInvT) \ + - beta* LrInvPsi1_rT.dot(Y.T).dot(LcInvPsi1_cT.T).dot(LcInvMLrInvT) \ + - beta/2. * tr_LcInvPsi2_cLcInvT * LrInvPsi2_rLrInvT - Mc/2.*np.eye(Mr) \ + + tdot(LcInvMLrInvT.T)/2. + tr_LcInvScLcInvT/2. * LrInvSrLrInvT + + dL_dKuu_r = backsub_both_sides(Lr, tmp, 'left') + dL_dKuu_r += dL_dKuu_r.T + dL_dKuu_r *= 0.5 + + #====================================================================== + # Compute dL_dthetaL + #====================================================================== + + dL_dthetaL = -D*N*beta/2. - logL_A*beta*beta/2. + + #====================================================================== + # Compute dL_dqU + #====================================================================== + + tmp = -beta * LcInvPsi2_cLcInvT.dot(LcInvMLrInvT).dot(LrInvPsi2_rLrInvT)\ + + beta* LcInvPsi1_cT.dot(Y).dot(LrInvPsi1_rT.T) - LcInvMLrInvT + + dL_dqU_mean = dtrtrs(Lc, dtrtrs(Lr, tmp.T, trans=1)[0].T, trans=1)[0] + + LScInv = dtrtri(LSc) + tmp = -beta/2.*tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT * LcInvPsi2_cLcInvT -tr_LrInvSrLrInvT/2.*np.eye(Mc) + dL_dqU_var_c = backsub_both_sides(Lc, tmp, 'left') + tdot(LScInv.T) * Mr/2. + + LSrInv = dtrtri(LSr) + tmp = -beta/2.*tr_LcInvPsi2_cLcInvT_LcInvScLcInvT * LrInvPsi2_rLrInvT -tr_LcInvScLcInvT/2.*np.eye(Mr) + dL_dqU_var_r = backsub_both_sides(Lr, tmp, 'left') + tdot(LSrInv.T) * Mc/2. 
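+        # The qU gradients above are formed in the whitened space and mapped
+        # back through the Cholesky factors Lc/Lr by backsub_both_sides.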
+
+        #======================================================================
+        # Compute the Posterior distribution of inducing points p(u|Y)
+        #======================================================================
+
+        post = PosteriorMultioutput(LcInvMLrInvT=LcInvMLrInvT, LcInvScLcInvT=LcInvScLcInvT,
+                                    LrInvSrLrInvT=LrInvSrLrInvT, Lr=Lr, Lc=Lc, kern_r=kern_r, Xr=Xr, Zr=Zr)
+
+        #======================================================================
+        # Compute dL_dpsi
+        #======================================================================
+
+        dL_dpsi0_r = - psi0_c * beta/2. * np.ones((D,))
+        dL_dpsi0_c = - psi0_r * beta/2. * np.ones((N,))
+
+        dL_dpsi1_c = beta * dtrtrs(Lc, (Y.dot(LrInvPsi1_rT.T).dot(LcInvMLrInvT.T)).T, trans=1)[0].T
+        dL_dpsi1_r = beta * dtrtrs(Lr, (Y.T.dot(LcInvPsi1_cT.T).dot(LcInvMLrInvT)).T, trans=1)[0].T
+
+        tmp = beta/2.*(-LcInvMLrInvT.dot(LrInvPsi2_rLrInvT).dot(LcInvMLrInvT.T) - tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT * LcInvScLcInvT +tr_LrInvPsi2_rLrInvT *np.eye(Mc))
+        dL_dpsi2_c = backsub_both_sides(Lc, tmp, 'left')
+        tmp = beta/2.*(-LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT) - tr_LcInvPsi2_cLcInvT_LcInvScLcInvT * LrInvSrLrInvT +tr_LcInvPsi2_cLcInvT *np.eye(Mr))
+        dL_dpsi2_r = backsub_both_sides(Lr, tmp, 'left')
+
+        if not uncertain_inputs_r:
+            dL_dpsi1_r += psi1_r.dot(dL_dpsi2_r+dL_dpsi2_r.T)
+        if not uncertain_inputs_c:
+            dL_dpsi1_c += psi1_c.dot(dL_dpsi2_c+dL_dpsi2_c.T)
+
+        grad_dict = {
+            'dL_dthetaL':dL_dthetaL,
+            'dL_dqU_mean':dL_dqU_mean,
+            'dL_dqU_var_c':dL_dqU_var_c,
+            'dL_dqU_var_r':dL_dqU_var_r,
+            'dL_dKuu_c': dL_dKuu_c,
+            'dL_dKuu_r': dL_dKuu_r,
+        }
+
+        if uncertain_inputs_c:
+            grad_dict['dL_dpsi0_c'] = dL_dpsi0_c
+            grad_dict['dL_dpsi1_c'] = dL_dpsi1_c
+            grad_dict['dL_dpsi2_c'] = dL_dpsi2_c
+        else:
+            grad_dict['dL_dKdiag_c'] = dL_dpsi0_c
+            grad_dict['dL_dKfu_c'] = dL_dpsi1_c
+
+        if uncertain_inputs_r:
+            grad_dict['dL_dpsi0_r'] = dL_dpsi0_r
+            grad_dict['dL_dpsi1_r'] = dL_dpsi1_r
+            grad_dict['dL_dpsi2_r'] = dL_dpsi2_r
+        else:
+            grad_dict['dL_dKdiag_r'] = dL_dpsi0_r
+            grad_dict['dL_dKfu_r'] = dL_dpsi1_r
+
+        return post, logL, grad_dict
+
+class PosteriorMultioutput(object):
+
+    def __init__(self,LcInvMLrInvT, LcInvScLcInvT, LrInvSrLrInvT, Lr, Lc, kern_r, Xr, Zr):
+        self.LcInvMLrInvT = LcInvMLrInvT
+        self.LcInvScLcInvT = LcInvScLcInvT
+        self.LrInvSrLrInvT = LrInvSrLrInvT
+        self.Lr = Lr
+        self.Lc = Lc
+        self.kern_r = kern_r
+        self.Xr = Xr
+        self.Zr = Zr
+
+    def _prepare(self):
+        D, Mr, Mc = self.Xr.shape[0], self.Zr.shape[0], self.LcInvMLrInvT.shape[0]
+        psi2_r_n = self.kern_r.psi2n(self.Zr, self.Xr)
+        psi0_r = self.kern_r.psi0(self.Zr, self.Xr)
+        psi1_r = self.kern_r.psi1(self.Zr, self.Xr)
+
+        LrInvPsi1_rT = dtrtrs(self.Lr, psi1_r.T)[0]
+        self.woodbury_vector = self.LcInvMLrInvT.dot(LrInvPsi1_rT)
+
+        LrInvPsi2_r_nLrInvT = dtrtrs(self.Lr, np.swapaxes((dtrtrs(self.Lr, psi2_r_n.reshape(D*Mr,Mr).T)[0].T).reshape(D,Mr,Mr),1,2).reshape(D*Mr,Mr).T)[0].T.reshape(D,Mr,Mr)
+
+        tr_LrInvPsi2_r_nLrInvT = np.diagonal(LrInvPsi2_r_nLrInvT,axis1=1,axis2=2).sum(1)
+        tr_LrInvPsi2_r_nLrInvT_LrInvSrLrInvT = LrInvPsi2_r_nLrInvT.reshape(D,Mr*Mr).dot(self.LrInvSrLrInvT.flat)
+
+        tmp = LrInvPsi2_r_nLrInvT - LrInvPsi1_rT.T[:,:,None]*LrInvPsi1_rT.T[:,None,:]
+        tmp = np.swapaxes(tmp.reshape(D*Mr,Mr).dot(self.LcInvMLrInvT.T).reshape(D,Mr,Mc), 1,2).reshape(D*Mc,Mr).dot(self.LcInvMLrInvT.T).reshape(D,Mc,Mc)
+
+    def _raw_predict(self, kern, Xnew, pred_var, full_cov=False):
+
+        N = Xnew.shape[0]
+        psi1_c = kern.K(Xnew, pred_var)
+        psi0_c = kern.Kdiag(Xnew)
+        LcInvPsi1_cT =
dtrtrs(self.Lc, psi1_c.T)[0] + + D, Mr, Mc = self.Xr.shape[0], self.Zr.shape[0], self.LcInvMLrInvT.shape[0] + psi2_r_n = self.kern_r.psi2n(self.Zr, self.Xr) + psi0_r = self.kern_r.psi0(self.Zr, self.Xr) + psi1_r = self.kern_r.psi1(self.Zr, self.Xr) + + LrInvPsi1_rT = dtrtrs(self.Lr, psi1_r.T)[0] + woodbury_vector = self.LcInvMLrInvT.dot(LrInvPsi1_rT) + + mu = np.dot(LcInvPsi1_cT.T, woodbury_vector) + + LrInvPsi2_r_nLrInvT = dtrtrs(self.Lr, np.swapaxes((dtrtrs(self.Lr, psi2_r_n.reshape(D*Mr,Mr).T)[0].T).reshape(D,Mr,Mr),1,2).reshape(D*Mr,Mr).T)[0].T.reshape(D,Mr,Mr) + + tr_LrInvPsi2_r_nLrInvT = np.diagonal(LrInvPsi2_r_nLrInvT,axis1=1,axis2=2).sum(1) + tr_LrInvPsi2_r_nLrInvT_LrInvSrLrInvT = LrInvPsi2_r_nLrInvT.reshape(D,Mr*Mr).dot(self.LrInvSrLrInvT.flat) + + tmp = LrInvPsi2_r_nLrInvT - LrInvPsi1_rT.T[:,:,None]*LrInvPsi1_rT.T[:,None,:] + tmp = np.swapaxes(tmp.reshape(D*Mr,Mr).dot(self.LcInvMLrInvT.T).reshape(D,Mr,Mc), 1,2).reshape(D*Mc,Mr).dot(self.LcInvMLrInvT.T).reshape(D,Mc,Mc) + + var1 = (tmp.reshape(D*Mc,Mc).dot(LcInvPsi1_cT).reshape(D,Mc,N)*LcInvPsi1_cT[None,:,:]).sum(1).T + var2 = psi0_c[:,None]*psi0_r[None,:] + var3 = tr_LrInvPsi2_r_nLrInvT[None,:]*np.square(LcInvPsi1_cT).sum(0)[:,None] + var4 = tr_LrInvPsi2_r_nLrInvT_LrInvSrLrInvT[None,:]* (self.LcInvScLcInvT.dot(LcInvPsi1_cT)*LcInvPsi1_cT).sum(0)[:,None] + var = var1+var2-var3+var4 + return mu, var diff --git a/GPy/inference/latent_function_inference/vardtc_svi_multiout_miss.py b/GPy/inference/latent_function_inference/vardtc_svi_multiout_miss.py new file mode 100644 index 00000000..dc4d24c3 --- /dev/null +++ b/GPy/inference/latent_function_inference/vardtc_svi_multiout_miss.py @@ -0,0 +1,306 @@ +#from .posterior import Posterior +from GPy.util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri,pdinv, dpotri +from GPy.util import diag +from GPy.core.parameterization.variational import VariationalPosterior +import numpy as np +from GPy.inference.latent_function_inference import LatentFunctionInference +from GPy.inference.latent_function_inference.posterior import Posterior +from .vardtc_svi_multiout import PosteriorMultioutput +log_2_pi = np.log(2*np.pi) + + +class VarDTC_SVI_Multiout_Miss(LatentFunctionInference): + """ + """ + const_jitter = 1e-6 + + def get_trYYT(self, Y): + return np.sum(np.square(Y)) + + def get_YYTfactor(self, Y): + N, D = Y.shape + if (N>=D): + return Y.view(np.ndarray) + else: + return jitchol(tdot(Y)) + + def gatherPsiStat(self, kern, X, Z, uncertain_inputs): + + if uncertain_inputs: + psi0 = kern.psi0(Z, X) + psi1 = kern.psi1(Z, X) + psi2 = kern.psi2n(Z, X) + else: + psi0 = kern.Kdiag(X) + psi1 = kern.K(X, Z) + psi2 = psi1[:,:,None]*psi1[:,None,:] + + return psi0, psi1, psi2 + + def _init_grad_dict(self, N, D, Mr, Mc): + grad_dict = { + 'dL_dthetaL': np.zeros(D), + 'dL_dqU_mean': np.zeros((Mc,Mr)), + 'dL_dqU_var_c':np.zeros((Mc,Mc)), + 'dL_dqU_var_r':np.zeros((Mr,Mr)), + 'dL_dKuu_c': np.zeros((Mc,Mc)), + 'dL_dKuu_r': np.zeros((Mr,Mr)), + 'dL_dpsi0_c': np.zeros(N), + 'dL_dpsi1_c': np.zeros((N,Mc)), + 'dL_dpsi2_c': np.zeros((N,Mc,Mc)), + 'dL_dpsi0_r': np.zeros(D), + 'dL_dpsi1_r': np.zeros((D,Mr)), + 'dL_dpsi2_r': np.zeros((D,Mr,Mr)), + } + return grad_dict + + def inference_d(self, d, beta, Y, indexD, grad_dict, mid_res, uncertain_inputs_r, uncertain_inputs_c, Mr, Mc): + + idx_d = indexD==d + Y = Y[idx_d] + N, D = Y.shape[0], 1 + beta = beta[d] + + psi0_r, psi1_r, psi2_r = mid_res['psi0_r'], mid_res['psi1_r'], mid_res['psi2_r'] + psi0_c, psi1_c, psi2_c = mid_res['psi0_c'], mid_res['psi1_c'], 
mid_res['psi2_c'] + psi0_r, psi1_r, psi2_r = psi0_r[d], psi1_r[d:d+1], psi2_r[d] + psi0_c, psi1_c, psi2_c = psi0_c[idx_d].sum(), psi1_c[idx_d], psi2_c[idx_d].sum(0) + + Lr = mid_res['Lr'] + Lc = mid_res['Lc'] + LcInvMLrInvT = mid_res['LcInvMLrInvT'] + LcInvScLcInvT = mid_res['LcInvScLcInvT'] + LrInvSrLrInvT = mid_res['LrInvSrLrInvT'] + + + LcInvPsi2_cLcInvT = backsub_both_sides(Lc, psi2_c,'right') + LrInvPsi2_rLrInvT = backsub_both_sides(Lr, psi2_r,'right') + LcInvPsi1_cT = dtrtrs(Lc, psi1_c.T)[0] + LrInvPsi1_rT = dtrtrs(Lr, psi1_r.T)[0] + + tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT = (LrInvPsi2_rLrInvT*LrInvSrLrInvT).sum() + tr_LcInvPsi2_cLcInvT_LcInvScLcInvT = (LcInvPsi2_cLcInvT*LcInvScLcInvT).sum() + tr_LrInvPsi2_rLrInvT = np.trace(LrInvPsi2_rLrInvT) + tr_LcInvPsi2_cLcInvT = np.trace(LcInvPsi2_cLcInvT) + + logL_A = - np.square(Y).sum() \ + - (LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT)*LrInvPsi2_rLrInvT).sum() \ + - tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT* tr_LcInvPsi2_cLcInvT_LcInvScLcInvT \ + + 2 * (Y * LcInvPsi1_cT.T.dot(LcInvMLrInvT).dot(LrInvPsi1_rT)).sum() - psi0_c * psi0_r \ + + tr_LrInvPsi2_rLrInvT * tr_LcInvPsi2_cLcInvT + + logL = -N*D/2.*(np.log(2.*np.pi)-np.log(beta)) + beta/2.* logL_A + + # ======= Gradients ===== + + tmp = beta* LcInvPsi2_cLcInvT.dot(LcInvMLrInvT).dot(LrInvPsi2_rLrInvT).dot(LcInvMLrInvT.T) \ + + beta* tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT * LcInvPsi2_cLcInvT.dot(LcInvScLcInvT) \ + - beta* LcInvMLrInvT.dot(LrInvPsi1_rT).dot(Y.T).dot(LcInvPsi1_cT.T) \ + - beta/2. * tr_LrInvPsi2_rLrInvT* LcInvPsi2_cLcInvT + + dL_dKuu_c = backsub_both_sides(Lc, tmp, 'left') + dL_dKuu_c += dL_dKuu_c.T + dL_dKuu_c *= 0.5 + + tmp = beta* LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT).dot(LrInvPsi2_rLrInvT) \ + + beta* tr_LcInvPsi2_cLcInvT_LcInvScLcInvT * LrInvPsi2_rLrInvT.dot(LrInvSrLrInvT) \ + - beta* LrInvPsi1_rT.dot(Y.T).dot(LcInvPsi1_cT.T).dot(LcInvMLrInvT) \ + - beta/2. * tr_LcInvPsi2_cLcInvT * LrInvPsi2_rLrInvT + + dL_dKuu_r = backsub_both_sides(Lr, tmp, 'left') + dL_dKuu_r += dL_dKuu_r.T + dL_dKuu_r *= 0.5 + + #====================================================================== + # Compute dL_dthetaL + #====================================================================== + + dL_dthetaL = -D*N*beta/2. - logL_A*beta*beta/2. + + #====================================================================== + # Compute dL_dqU + #====================================================================== + + tmp = -beta * LcInvPsi2_cLcInvT.dot(LcInvMLrInvT).dot(LrInvPsi2_rLrInvT)\ + + beta* LcInvPsi1_cT.dot(Y).dot(LrInvPsi1_rT.T) + + dL_dqU_mean = dtrtrs(Lc, dtrtrs(Lr, tmp.T, trans=1)[0].T, trans=1)[0] + + tmp = -beta/2.*tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT * LcInvPsi2_cLcInvT + dL_dqU_var_c = backsub_both_sides(Lc, tmp, 'left') + + tmp = -beta/2.*tr_LcInvPsi2_cLcInvT_LcInvScLcInvT * LrInvPsi2_rLrInvT + dL_dqU_var_r = backsub_both_sides(Lr, tmp, 'left') + + #====================================================================== + # Compute dL_dpsi + #====================================================================== + + dL_dpsi0_r = - psi0_c * beta/2. * np.ones((D,)) + dL_dpsi0_c = - psi0_r * beta/2. 
* np.ones((N,)) + + dL_dpsi1_c = beta * dtrtrs(Lc, (Y.dot(LrInvPsi1_rT.T).dot(LcInvMLrInvT.T)).T, trans=1)[0].T + dL_dpsi1_r = beta * dtrtrs(Lr, (Y.T.dot(LcInvPsi1_cT.T).dot(LcInvMLrInvT)).T, trans=1)[0].T + + tmp = beta/2.*(-LcInvMLrInvT.dot(LrInvPsi2_rLrInvT).dot(LcInvMLrInvT.T) - tr_LrInvPsi2_rLrInvT_LrInvSrLrInvT * LcInvScLcInvT + +tr_LrInvPsi2_rLrInvT *np.eye(Mc)) + dL_dpsi2_c = backsub_both_sides(Lc, tmp, 'left') + tmp = beta/2.*(-LcInvMLrInvT.T.dot(LcInvPsi2_cLcInvT).dot(LcInvMLrInvT) - tr_LcInvPsi2_cLcInvT_LcInvScLcInvT * LrInvSrLrInvT + +tr_LcInvPsi2_cLcInvT *np.eye(Mr)) + dL_dpsi2_r = backsub_both_sides(Lr, tmp, 'left') + + grad_dict['dL_dthetaL'][d:d+1] = dL_dthetaL + grad_dict['dL_dqU_mean'] += dL_dqU_mean + grad_dict['dL_dqU_var_c'] += dL_dqU_var_c + grad_dict['dL_dqU_var_r'] += dL_dqU_var_r + grad_dict['dL_dKuu_c'] += dL_dKuu_c + grad_dict['dL_dKuu_r'] += dL_dKuu_r + + # if not uncertain_inputs_r: + # dL_dpsi1_r += (dL_dpsi2_r * psi1_r[:,:,None]).sum(1) + (dL_dpsi2_r * psi1_r[:,None,:]).sum(2) + # if not uncertain_inputs_c: + # dL_dpsi1_c += (dL_dpsi2_c * psi1_c[:,:,None]).sum(1) + (dL_dpsi2_c * psi1_c[:,None,:]).sum(2) + + if not uncertain_inputs_r: + dL_dpsi1_r += psi1_r.dot(dL_dpsi2_r+dL_dpsi2_r.T) + if not uncertain_inputs_c: + dL_dpsi1_c += psi1_c.dot(dL_dpsi2_c+dL_dpsi2_c.T) + + grad_dict['dL_dpsi0_c'][idx_d] += dL_dpsi0_c + grad_dict['dL_dpsi1_c'][idx_d] += dL_dpsi1_c + grad_dict['dL_dpsi2_c'][idx_d] += dL_dpsi2_c + + grad_dict['dL_dpsi0_r'][d:d+1] += dL_dpsi0_r + grad_dict['dL_dpsi1_r'][d:d+1] += dL_dpsi1_r + grad_dict['dL_dpsi2_r'][d] += dL_dpsi2_r + + + return logL + + + def inference(self, kern_r, kern_c, Xr, Xc, Zr, Zc, likelihood, Y, qU_mean ,qU_var_r, qU_var_c, indexD, output_dim): + """ + The SVI-VarDTC inference + """ + + N, D, Mr, Mc, Qr, Qc = Y.shape[0], output_dim,Zr.shape[0], Zc.shape[0], Zr.shape[1], Zc.shape[1] + + uncertain_inputs_r = isinstance(Xr, VariationalPosterior) + uncertain_inputs_c = isinstance(Xc, VariationalPosterior) + uncertain_outputs = isinstance(Y, VariationalPosterior) + + grad_dict = self._init_grad_dict(N,D,Mr,Mc) + + beta = 1./likelihood.variance + if len(beta)==1: + beta = np.zeros(D)+beta + + psi0_r, psi1_r, psi2_r = self.gatherPsiStat(kern_r, Xr, Zr, uncertain_inputs_r) + psi0_c, psi1_c, psi2_c = self.gatherPsiStat(kern_c, Xc, Zc, uncertain_inputs_c) + + #====================================================================== + # Compute Common Components + #====================================================================== + + Kuu_r = kern_r.K(Zr).copy() + diag.add(Kuu_r, self.const_jitter) + Lr = jitchol(Kuu_r) + + Kuu_c = kern_c.K(Zc).copy() + diag.add(Kuu_c, self.const_jitter) + Lc = jitchol(Kuu_c) + + mu, Sr, Sc = qU_mean, qU_var_r, qU_var_c + LSr = jitchol(Sr) + LSc = jitchol(Sc) + + LcInvMLrInvT = dtrtrs(Lc,dtrtrs(Lr,mu.T)[0].T)[0] + LcInvLSc = dtrtrs(Lc, LSc)[0] + LrInvLSr = dtrtrs(Lr, LSr)[0] + LcInvScLcInvT = tdot(LcInvLSc) + LrInvSrLrInvT = tdot(LrInvLSr) + tr_LrInvSrLrInvT = np.square(LrInvLSr).sum() + tr_LcInvScLcInvT = np.square(LcInvLSc).sum() + + mid_res = { + 'psi0_r': psi0_r, + 'psi1_r': psi1_r, + 'psi2_r': psi2_r, + 'psi0_c': psi0_c, + 'psi1_c': psi1_c, + 'psi2_c': psi2_c, + 'Lr':Lr, + 'Lc':Lc, + 'LcInvMLrInvT': LcInvMLrInvT, + 'LcInvScLcInvT': LcInvScLcInvT, + 'LrInvSrLrInvT': LrInvSrLrInvT, + } + + #====================================================================== + # Compute log-likelihood + #====================================================================== + + logL = 0. 
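+        # The data-dependent terms of the bound are accumulated per output by
+        # inference_d; the KL-type terms between q(U) and the prior are added once below.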
+ for d in range(D): + logL += self.inference_d(d, beta, Y, indexD, grad_dict, mid_res, uncertain_inputs_r, uncertain_inputs_c, Mr, Mc) + + logL += -Mc * (np.log(np.diag(Lr)).sum()-np.log(np.diag(LSr)).sum()) -Mr * (np.log(np.diag(Lc)).sum()-np.log(np.diag(LSc)).sum()) \ + - np.square(LcInvMLrInvT).sum()/2. - tr_LrInvSrLrInvT * tr_LcInvScLcInvT/2. + Mr*Mc/2. + + #====================================================================== + # Compute dL_dKuu + #====================================================================== + + tmp = tdot(LcInvMLrInvT)/2. + tr_LrInvSrLrInvT/2. * LcInvScLcInvT - Mr/2.*np.eye(Mc) + + dL_dKuu_c = backsub_both_sides(Lc, tmp, 'left') + dL_dKuu_c += dL_dKuu_c.T + dL_dKuu_c *= 0.5 + + tmp = tdot(LcInvMLrInvT.T)/2. + tr_LcInvScLcInvT/2. * LrInvSrLrInvT - Mc/2.*np.eye(Mr) + + dL_dKuu_r = backsub_both_sides(Lr, tmp, 'left') + dL_dKuu_r += dL_dKuu_r.T + dL_dKuu_r *= 0.5 + + #====================================================================== + # Compute dL_dqU + #====================================================================== + + tmp = - LcInvMLrInvT + dL_dqU_mean = dtrtrs(Lc, dtrtrs(Lr, tmp.T, trans=1)[0].T, trans=1)[0] + + LScInv = dtrtri(LSc) + tmp = -tr_LrInvSrLrInvT/2.*np.eye(Mc) + dL_dqU_var_c = backsub_both_sides(Lc, tmp, 'left') + tdot(LScInv.T) * Mr/2. + + LSrInv = dtrtri(LSr) + tmp = -tr_LcInvScLcInvT/2.*np.eye(Mr) + dL_dqU_var_r = backsub_both_sides(Lr, tmp, 'left') + tdot(LSrInv.T) * Mc/2. + + #====================================================================== + # Compute the Posterior distribution of inducing points p(u|Y) + #====================================================================== + + post = PosteriorMultioutput(LcInvMLrInvT=LcInvMLrInvT, LcInvScLcInvT=LcInvScLcInvT, + LrInvSrLrInvT=LrInvSrLrInvT, Lr=Lr, Lc=Lc, kern_r=kern_r, Xr=Xr, Zr=Zr) + + #====================================================================== + # Compute dL_dpsi + #====================================================================== + + grad_dict['dL_dqU_mean'] += dL_dqU_mean + grad_dict['dL_dqU_var_c'] += dL_dqU_var_c + grad_dict['dL_dqU_var_r'] += dL_dqU_var_r + grad_dict['dL_dKuu_c'] += dL_dKuu_c + grad_dict['dL_dKuu_r'] += dL_dKuu_r + + if not uncertain_inputs_c: + grad_dict['dL_dKdiag_c'] = grad_dict['dL_dpsi0_c'] + grad_dict['dL_dKfu_c'] = grad_dict['dL_dpsi1_c'] + + if not uncertain_inputs_r: + grad_dict['dL_dKdiag_r'] = grad_dict['dL_dpsi0_r'] + grad_dict['dL_dKfu_r'] = grad_dict['dL_dpsi1_r'] + + return post, logL, grad_dict diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py index 35d11cd7..ef138fc4 100644 --- a/GPy/models/__init__.py +++ b/GPy/models/__init__.py @@ -27,3 +27,5 @@ from .state_space_model import StateSpace from .ibp_lfm import IBPLFM from .gp_offset_regression import GPOffsetRegression from .gp_grid_regression import GPRegressionGrid +from .gp_multiout_regression import GPMultioutRegression +from .gp_multiout_regression_md import GPMultioutRegressionMD diff --git a/GPy/models/gp_multiout_regression.py b/GPy/models/gp_multiout_regression.py new file mode 100644 index 00000000..193d9726 --- /dev/null +++ b/GPy/models/gp_multiout_regression.py @@ -0,0 +1,175 @@ +# Copyright (c) 2017 Zhenwen Dai +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import numpy as np +from ..core import SparseGP +from .. import likelihoods +from .. import kern +from .. 
import util
+from GPy.core.parameterization.variational import NormalPosterior, NormalPrior
+from ..core.parameterization.param import Param
+from paramz.transformations import Logexp
+from ..util.linalg import tdot
+
+class GPMultioutRegression(SparseGP):
+    """
+    Gaussian Process model for scalable multioutput regression
+
+    All outputs share the input locations X. The model couples a kernel on the
+    inputs (columns) with a kernel on a learned latent space of the outputs
+    (rows), and is inferred with the stochastic-variational sparse
+    approximation VarDTC_SVI_Multiout.
+
+    :param X: input observations
+    :type X: numpy array (N x Qc)
+    :param Y: observed values, one column per output
+    :type Y: numpy array (N x D)
+    :param int Xr_dim: dimensionality of the latent space of the outputs
+    :param kernel: kernel for the input (column) space, defaults to RBF
+    :type kernel: None | GPy.kern.Kern
+    :param kernel_row: kernel for the latent output (row) space, defaults to RBF
+    :type kernel_row: None | GPy.kern.Kern
+    :param likelihood: currently unused; a Gaussian likelihood is constructed internally
+    :param Z: inducing inputs for the column space, defaults to a random subset of X
+    :param Z_row: inducing inputs for the row space, defaults to a random subset of X_row
+    :param X_row: initial latent positions of the outputs, defaults to an SVD-based initialization
+    :param Xvariance_row: initial variances of the latent positions of the outputs
+    :param num_inducing: numbers of inducing points (Mc, Mr) for the column and row spaces
+    :type num_inducing: (int, int)
+    :param qU_var_r_W_dim: rank of the W term of the low-rank plus diagonal row variational covariance, defaults to full rank
+    :param qU_var_c_W_dim: rank of the W term of the low-rank plus diagonal column variational covariance, defaults to full rank
+    :param str init: 'GP' to initialize from a sparse GP followed by a Bayesian GP-LVM, otherwise random initialization
+    :param str name: model name
+    """
+    def __init__(self, X, Y, Xr_dim, kernel=None, kernel_row=None, likelihood=None, Z=None, Z_row=None, X_row=None, Xvariance_row=None, num_inducing=(10,10), qU_var_r_W_dim=None, qU_var_c_W_dim=None, init='GP', name='GPMR'):
+
+        #Kernel
+        if kernel is None:
+            kernel = kern.RBF(X.shape[1])
+
+        if kernel_row is None:
+            kernel_row = kern.RBF(Xr_dim,name='kern_row')
+
+        if init=='GP':
+            from . import SparseGPRegression, BayesianGPLVM
+            from ..util.linalg import jitchol
+            Mc, Mr = num_inducing
+            print('Initializing with GP...')
+            print('Fit Sparse GP...')
+            m_sgp = SparseGPRegression(X,Y,kernel=kernel.copy(),num_inducing=Mc)
+            m_sgp.likelihood.variance[:] = Y.var()*0.01
+            m_sgp.optimize(max_iters=1000)
+            print('Fit BGPLVM...')
+            m_lvm = BayesianGPLVM(m_sgp.posterior.mean.copy().T,Xr_dim,kernel=kernel_row.copy(), num_inducing=Mr)
+            m_lvm.likelihood.variance[:] = m_lvm.Y.var()*0.01
+            m_lvm.optimize(max_iters=10000)
+
+            kernel[:] = m_sgp.kern.param_array.copy()
+            kernel.variance[:] = np.sqrt(kernel.variance)
+            Z = m_sgp.Z.values.copy()
+            kernel_row[:] = m_lvm.kern.param_array.copy()
+            kernel_row.variance[:] = np.sqrt(kernel_row.variance)
+            Z_row = m_lvm.Z.values.copy()
+            X_row = m_lvm.X.mean.values.copy()
+            Xvariance_row = m_lvm.X.variance.values
+
+            qU_mean = m_lvm.posterior.mean.T.copy()
+            qU_var_col_W = jitchol(m_sgp.posterior.covariance)
+            qU_var_col_diag = np.full(Mc,1e-5)
+            qU_var_row_W = jitchol(m_lvm.posterior.covariance)
+            qU_var_row_diag = np.full(Mr,1e-5)
+            print('Done.')
+        else:
+            qU_mean = np.zeros(num_inducing)
+            qU_var_col_W = np.random.randn(num_inducing[0],num_inducing[0] if qU_var_c_W_dim is None else qU_var_c_W_dim)*0.01
+            qU_var_col_diag = np.full(num_inducing[0],1e-5)
+            qU_var_row_W = np.random.randn(num_inducing[1],num_inducing[1] if qU_var_r_W_dim is None else qU_var_r_W_dim)*0.01
+            qU_var_row_diag = np.full(num_inducing[1],1e-5)
+
+        if X_row is None:
+            u,s,v = np.linalg.svd(Y)
+            X_row = Y.T.dot(u[:,:Xr_dim])#*np.sqrt(s)[:Xr_dim])
+            X_row = X_row/X_row.std(0)
+        if Xvariance_row is None:
+            Xvariance_row = np.ones((Y.shape[1],Xr_dim))*0.0001
+        if Z is None:
+            Z = X[np.random.permutation(X.shape[0])[:num_inducing[0]]].copy()
+        if Z_row is None:
+            Z_row = X_row[np.random.permutation(X_row.shape[0])[:num_inducing[1]]].copy()
+
+        self.kern_row = kernel_row
+        self.X_row = NormalPosterior(X_row, Xvariance_row,name='Xr')
+        self.Z_row =
Param('Zr', Z_row) + self.variational_prior_row = NormalPrior() + + self.qU_mean = Param('qU_mean', qU_mean) + self.qU_var_c_W = Param('qU_var_col_W', qU_var_col_W) + self.qU_var_c_diag = Param('qU_var_col_diag', qU_var_col_diag, Logexp()) + self.qU_var_r_W = Param('qU_var_row_W',qU_var_row_W) + self.qU_var_r_diag = Param('qU_var_row_diag', qU_var_row_diag, Logexp()) + + #Likelihood + likelihood = likelihoods.Gaussian(variance=np.var(Y)*0.01) + from ..inference.latent_function_inference import VarDTC_SVI_Multiout + inference_method = VarDTC_SVI_Multiout() + + super(GPMultioutRegression,self).__init__(X, Y, Z, kernel, likelihood=likelihood, + name=name, inference_method=inference_method) + + self.link_parameters(self.kern_row, self.X_row, self.Z_row,self.qU_mean, self.qU_var_c_W, self.qU_var_c_diag, self.qU_var_r_W, self.qU_var_r_diag) + + self._log_marginal_likelihood = np.nan + + def parameters_changed(self): + qU_var_c = tdot(self.qU_var_c_W) + np.diag(self.qU_var_c_diag) + qU_var_r = tdot(self.qU_var_r_W) + np.diag(self.qU_var_r_diag) + self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern_row, self.kern, self.X_row, self.X, self.Z_row, self.Z, self.likelihood, self.Y, self.qU_mean ,qU_var_r, qU_var_c) + + self.likelihood.update_gradients(self.grad_dict['dL_dthetaL']) + self.qU_mean.gradient[:] = self.grad_dict['dL_dqU_mean'] + self.qU_var_c_diag.gradient[:] = np.diag(self.grad_dict['dL_dqU_var_c']) + self.qU_var_c_W.gradient[:] = (self.grad_dict['dL_dqU_var_c']+self.grad_dict['dL_dqU_var_c'].T).dot(self.qU_var_c_W) + self.qU_var_r_diag.gradient[:] = np.diag(self.grad_dict['dL_dqU_var_r']) + self.qU_var_r_W.gradient[:] = (self.grad_dict['dL_dqU_var_r']+self.grad_dict['dL_dqU_var_r'].T).dot(self.qU_var_r_W) + + self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag_c'], self.X) + kerngrad = self.kern.gradient.copy() + self.kern.update_gradients_full(self.grad_dict['dL_dKfu_c'], self.X, self.Z) + kerngrad += self.kern.gradient + self.kern.update_gradients_full(self.grad_dict['dL_dKuu_c'], self.Z, None) + self.kern.gradient += kerngrad + #gradients wrt Z + self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKuu_c'], self.Z) + self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKfu_c'].T, self.Z, self.X) + + + #gradients wrt kernel + self.kern_row.update_gradients_full(self.grad_dict['dL_dKuu_r'], self.Z_row, None) + kerngrad = self.kern_row.gradient.copy() + self.kern_row.update_gradients_expectations(variational_posterior=self.X_row, + Z=self.Z_row, + dL_dpsi0=self.grad_dict['dL_dpsi0_r'], + dL_dpsi1=self.grad_dict['dL_dpsi1_r'], + dL_dpsi2=self.grad_dict['dL_dpsi2_r']) + self.kern_row.gradient += kerngrad + + #gradients wrt Z + self.Z_row.gradient = self.kern_row.gradients_X(self.grad_dict['dL_dKuu_r'], self.Z_row) + self.Z_row.gradient += self.kern_row.gradients_Z_expectations( + self.grad_dict['dL_dpsi0_r'], + self.grad_dict['dL_dpsi1_r'], + self.grad_dict['dL_dpsi2_r'], + Z=self.Z_row, + variational_posterior=self.X_row) + + self._log_marginal_likelihood -= self.variational_prior_row.KL_divergence(self.X_row) + + self.X_row.mean.gradient, self.X_row.variance.gradient = self.kern_row.gradients_qX_expectations( + variational_posterior=self.X_row, + Z=self.Z_row, + dL_dpsi0=self.grad_dict['dL_dpsi0_r'], + dL_dpsi1=self.grad_dict['dL_dpsi1_r'], + dL_dpsi2=self.grad_dict['dL_dpsi2_r']) + + self.variational_prior_row.update_gradients_KL(self.X_row) + + def optimize_auto(self,max_iters=10000,verbose=True): + 
self.Z.fix(warning=False)
+        self.kern.fix(warning=False)
+        self.kern_row.fix(warning=False)
+        self.Zr.fix(warning=False)
+        self.Xr.fix(warning=False)
+        self.optimize(max_iters=int(0.1*max_iters),messages=verbose)
+        self.unfix()
+        self.optimize(max_iters=max_iters,messages=verbose)
diff --git a/GPy/models/gp_multiout_regression_md.py b/GPy/models/gp_multiout_regression_md.py
new file mode 100644
index 00000000..0d912843
--- /dev/null
+++ b/GPy/models/gp_multiout_regression_md.py
@@ -0,0 +1,173 @@
+# Copyright (c) 2017 Zhenwen Dai
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+from ..core import SparseGP
+from .. import likelihoods
+from .. import kern
+from .. import util
+from GPy.core.parameterization.variational import NormalPosterior, NormalPrior
+from ..core.parameterization.param import Param
+from paramz.transformations import Logexp
+from ..util.linalg import tdot
+from .sparse_gp_regression_md import SparseGPRegressionMD
+
+class GPMultioutRegressionMD(SparseGP):
+    """
+    Gaussian Process model for scalable multioutput regression with missing data
+
+    The variant of GPMultioutRegression for outputs that do not share input
+    locations: the observations are passed as a flat column vector Y together
+    with an index vector indexD that assigns every data point to one of the
+    outputs. With heter_noise=True an individual noise variance is learned per
+    output.
+    """
+    def __init__(self, X, Y, indexD, Xr_dim, kernel=None, kernel_row=None, likelihood=None, Z=None, Z_row=None, X_row=None, Xvariance_row=None, num_inducing=(10,10), qU_var_r_W_dim=None, qU_var_c_W_dim=None, init='GP', heter_noise=False, name='GPMR'):
+
+        assert len(Y.shape)==1 or Y.shape[1]==1
+
+        self.output_dim = int(np.max(indexD))+1
+        self.heter_noise = heter_noise
+        self.indexD = indexD
+
+        #Kernel
+        if kernel is None:
+            kernel = kern.RBF(X.shape[1])
+        if kernel_row is None:
+            kernel_row = kern.RBF(Xr_dim,name='kern_row')
+
+        if init=='GP':
+            from . import SparseGPRegression, BayesianGPLVM
+            from ..util.linalg import jitchol
+            Mc, Mr = num_inducing
+            print('Initializing with GP...')
+            print('Fit Sparse GP...')
+            m_sgp = SparseGPRegressionMD(X,Y,indexD,kernel=kernel.copy(),num_inducing=Mc)
+            m_sgp.likelihood.variance[:] = Y.var()*0.01
+            m_sgp.optimize(max_iters=1000)
+            print('Fit BGPLVM...')
+            m_lvm = BayesianGPLVM(m_sgp.posterior.mean.copy().T,Xr_dim,kernel=kernel_row.copy(), num_inducing=Mr)
+            m_lvm.likelihood.variance[:] = m_lvm.Y.var()*0.01
+            m_lvm.optimize(max_iters=10000)
+
+            kernel[:] = m_sgp.kern.param_array.copy()
+            kernel.variance[:] = np.sqrt(kernel.variance)
+            Z = m_sgp.Z.values.copy()
+            kernel_row[:] = m_lvm.kern.param_array.copy()
+            kernel_row.variance[:] = np.sqrt(kernel_row.variance)
+            Z_row = m_lvm.Z.values.copy()
+            X_row = m_lvm.X.mean.values.copy()
+            Xvariance_row = m_lvm.X.variance.values
+
+            qU_mean = m_lvm.posterior.mean.T.copy()
+            qU_var_col_W = jitchol(m_sgp.posterior.covariance)
+            qU_var_col_diag = np.full(Mc,1e-5)
+            qU_var_row_W = jitchol(m_lvm.posterior.covariance)
+            qU_var_row_diag = np.full(Mr,1e-5)
+            print('Done.')
+        else:
+            qU_mean = np.zeros(num_inducing)
+            qU_var_col_W = np.random.randn(num_inducing[0],num_inducing[0] if qU_var_c_W_dim is None else qU_var_c_W_dim)*0.01
+            qU_var_col_diag = np.full(num_inducing[0],1e-5)
+            qU_var_row_W = np.random.randn(num_inducing[1],num_inducing[1] if qU_var_r_W_dim is None else qU_var_r_W_dim)*0.01
+            qU_var_row_diag = np.full(num_inducing[1],1e-5)
+
+
+        if Z is None:
+            Z = X[np.random.permutation(X.shape[0])[:num_inducing[0]]].copy()
+        if X_row is None:
+            X_row = np.random.randn(self.output_dim,Xr_dim)
+        if Xvariance_row is None:
+            Xvariance_row = np.ones((self.output_dim,Xr_dim))*0.0001
+        if Z_row is None:
+            Z_row =
X_row[np.random.permutation(X_row.shape[0])[:num_inducing[1]]].copy() + + self.kern_row = kernel_row + self.X_row = NormalPosterior(X_row, Xvariance_row,name='Xr') + self.Z_row = Param('Zr', Z_row) + self.variational_prior_row = NormalPrior() + + self.qU_mean = Param('qU_mean', qU_mean) + self.qU_var_c_W = Param('qU_var_col_W', qU_var_col_W) + self.qU_var_c_diag = Param('qU_var_col_diag', qU_var_col_diag, Logexp()) + self.qU_var_r_W = Param('qU_var_row_W',qU_var_row_W) + self.qU_var_r_diag = Param('qU_var_row_diag', qU_var_row_diag, Logexp()) + + #Likelihood + if heter_noise: + likelihood = likelihoods.Gaussian(variance=np.array([np.var(Y[indexD==d]) for d in range(self.output_dim)])*0.01) + else: + likelihood = likelihoods.Gaussian(variance=np.var(Y)*0.01) + from ..inference.latent_function_inference.vardtc_svi_multiout_miss import VarDTC_SVI_Multiout_Miss + inference_method = VarDTC_SVI_Multiout_Miss() + + super(GPMultioutRegressionMD,self).__init__(X, Y, Z, kernel, likelihood=likelihood, + name=name, inference_method=inference_method) + self.output_dim = int(np.max(indexD))+1 + + self.link_parameters(self.kern_row, self.X_row, self.Z_row,self.qU_mean, self.qU_var_c_W, self.qU_var_c_diag, self.qU_var_r_W, self.qU_var_r_diag) + + self._log_marginal_likelihood = np.nan + + def parameters_changed(self): + qU_var_c = tdot(self.qU_var_c_W) + np.diag(self.qU_var_c_diag) + qU_var_r = tdot(self.qU_var_r_W) + np.diag(self.qU_var_r_diag) + self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern_row, self.kern, self.X_row, self.X, self.Z_row, self.Z, self.likelihood, self.Y, self.qU_mean ,qU_var_r, qU_var_c, self.indexD, self.output_dim) + # import pdb;pdb.set_trace() + + if self.heter_noise: + self.likelihood.update_gradients(self.grad_dict['dL_dthetaL']) + else: + self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'].sum()) + self.qU_mean.gradient[:] = self.grad_dict['dL_dqU_mean'] + self.qU_var_c_diag.gradient[:] = np.diag(self.grad_dict['dL_dqU_var_c']) + self.qU_var_c_W.gradient[:] = (self.grad_dict['dL_dqU_var_c']+self.grad_dict['dL_dqU_var_c'].T).dot(self.qU_var_c_W) + self.qU_var_r_diag.gradient[:] = np.diag(self.grad_dict['dL_dqU_var_r']) + self.qU_var_r_W.gradient[:] = (self.grad_dict['dL_dqU_var_r']+self.grad_dict['dL_dqU_var_r'].T).dot(self.qU_var_r_W) + + self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag_c'], self.X) + kerngrad = self.kern.gradient.copy() + self.kern.update_gradients_full(self.grad_dict['dL_dKfu_c'], self.X, self.Z) + kerngrad += self.kern.gradient + self.kern.update_gradients_full(self.grad_dict['dL_dKuu_c'], self.Z, None) + self.kern.gradient += kerngrad + #gradients wrt Z + self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKuu_c'], self.Z) + self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKfu_c'].T, self.Z, self.X) + + + #gradients wrt kernel + self.kern_row.update_gradients_full(self.grad_dict['dL_dKuu_r'], self.Z_row, None) + kerngrad = self.kern_row.gradient.copy() + self.kern_row.update_gradients_expectations(variational_posterior=self.X_row, + Z=self.Z_row, + dL_dpsi0=self.grad_dict['dL_dpsi0_r'], + dL_dpsi1=self.grad_dict['dL_dpsi1_r'], + dL_dpsi2=self.grad_dict['dL_dpsi2_r']) + self.kern_row.gradient += kerngrad + + #gradients wrt Z + self.Z_row.gradient = self.kern_row.gradients_X(self.grad_dict['dL_dKuu_r'], self.Z_row) + self.Z_row.gradient += self.kern_row.gradients_Z_expectations( + self.grad_dict['dL_dpsi0_r'], + self.grad_dict['dL_dpsi1_r'], + 
self.grad_dict['dL_dpsi2_r'], + Z=self.Z_row, + variational_posterior=self.X_row) + + self._log_marginal_likelihood -= self.variational_prior_row.KL_divergence(self.X_row) + + self.X_row.mean.gradient, self.X_row.variance.gradient = self.kern_row.gradients_qX_expectations( + variational_posterior=self.X_row, + Z=self.Z_row, + dL_dpsi0=self.grad_dict['dL_dpsi0_r'], + dL_dpsi1=self.grad_dict['dL_dpsi1_r'], + dL_dpsi2=self.grad_dict['dL_dpsi2_r']) + + self.variational_prior_row.update_gradients_KL(self.X_row) + + def optimize_auto(self,max_iters=10000,verbose=True): + self.Z.fix(warning=False) + self.kern.fix(warning=False) + self.kern_row.fix(warning=False) + self.Zr.fix(warning=False) + self.Xr.fix(warning=False) + self.optimize(max_iters=int(0.1*max_iters),messages=verbose) + self.unfix() + self.optimize(max_iters=max_iters,messages=verbose) diff --git a/GPy/models/sparse_gp_regression_md.py b/GPy/models/sparse_gp_regression_md.py new file mode 100644 index 00000000..5762c44c --- /dev/null +++ b/GPy/models/sparse_gp_regression_md.py @@ -0,0 +1,64 @@ +# Copyright (c) 2012, James Hensman +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import numpy as np +from ..core.sparse_gp_mpi import SparseGP_MPI +from .. import likelihoods +from .. import kern +from ..inference.latent_function_inference.vardtc_md import VarDTC_MD +from GPy.core.parameterization.variational import NormalPosterior + +class SparseGPRegressionMD(SparseGP_MPI): + """ + Sparse Gaussian Process Regression with Missing Data + """ + + def __init__(self, X, Y, indexD, kernel=None, Z=None, num_inducing=10, X_variance=None, normalizer=None, mpi_comm=None, individual_Y_noise=False, name='sparse_gp'): + + assert len(Y.shape)==1 or Y.shape[1]==1 + self.individual_Y_noise = individual_Y_noise + self.indexD = indexD + output_dim = int(np.max(indexD))+1 + + num_data, input_dim = X.shape + + # kern defaults to rbf (plus white for stability) + if kernel is None: + kernel = kern.RBF(input_dim)# + kern.white(input_dim, variance=1e-3) + + # Z defaults to a subset of the data + if Z is None: + i = np.random.permutation(num_data)[:min(num_inducing, num_data)] + Z = X.view(np.ndarray)[i].copy() + else: + assert Z.shape[1] == input_dim + + if individual_Y_noise: + likelihood = likelihoods.Gaussian(variance=np.array([np.var(Y[indexD==d]) for d in range(output_dim)])*0.01) + else: + likelihood = likelihoods.Gaussian(variance=np.var(Y)*0.01) + + if not (X_variance is None): + X = NormalPosterior(X,X_variance) + + infr = VarDTC_MD() + + SparseGP_MPI.__init__(self, X, Y, Z, kernel, likelihood, inference_method=infr, normalizer=normalizer, mpi_comm=mpi_comm, name=name) + self.output_dim = output_dim + + def parameters_changed(self): + + self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y, self.indexD, self.output_dim, self.Y_metadata) + + self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'] if self.individual_Y_noise else self.grad_dict['dL_dthetaL'].sum()) + + self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X) + kerngrad = self.kern.gradient.copy() + self.kern.update_gradients_full(self.grad_dict['dL_dKnm'], self.X, self.Z) + kerngrad += self.kern.gradient + self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z, None) + self.kern.gradient += kerngrad + #gradients wrt Z + self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z) + self.Z.gradient += 
self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X) diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py index fc56fe5d..68e95ec0 100644 --- a/GPy/testing/model_tests.py +++ b/GPy/testing/model_tests.py @@ -335,29 +335,29 @@ class MiscTests(unittest.TestCase): Y2 = np.random.normal(0, 1, (40, 6)) Y3 = np.random.normal(0, 1, (40, 8)) Q = 5 - m = GPy.models.MRD(dict(data1=Y1, data2=Y2, data3=Y3), Q, + m = GPy.models.MRD(dict(data1=Y1, data2=Y2, data3=Y3), Q, ) m.randomize() self.assertTrue(m.checkgrad()) - - m = GPy.models.MRD(dict(data1=Y1, data2=Y2, data3=Y3), Q, initx='PCA_single', + + m = GPy.models.MRD(dict(data1=Y1, data2=Y2, data3=Y3), Q, initx='PCA_single', initz='random', - kernel=[GPy.kern.RBF(Q, ARD=1) for _ in range(3)], + kernel=[GPy.kern.RBF(Q, ARD=1) for _ in range(3)], inference_method=InferenceMethodList([VarDTC() for _ in range(3)]), likelihoods = [Gaussian(name='Gaussian_noise'.format(i)) for i in range(3)]) m.randomize() self.assertTrue(m.checkgrad()) - m = GPy.models.MRD(dict(data1=Y1, data2=Y2, data3=Y3), Q, initx='random', + m = GPy.models.MRD(dict(data1=Y1, data2=Y2, data3=Y3), Q, initx='random', initz='random', - kernel=GPy.kern.RBF(Q, ARD=1), + kernel=GPy.kern.RBF(Q, ARD=1), ) m.randomize() self.assertTrue(m.checkgrad()) - m = GPy.models.MRD(dict(data1=Y1, data2=Y2, data3=Y3), Q, X=np.random.normal(0,1,size=(40,Q)), + m = GPy.models.MRD(dict(data1=Y1, data2=Y2, data3=Y3), Q, X=np.random.normal(0,1,size=(40,Q)), X_variance=False, - kernel=GPy.kern.RBF(Q, ARD=1), + kernel=GPy.kern.RBF(Q, ARD=1), likelihoods = [Gaussian(name='Gaussian_noise'.format(i)) for i in range(3)]) m.randomize() self.assertTrue(m.checkgrad()) @@ -961,6 +961,127 @@ class GradientTests(np.testing.TestCase): m.randomize() self.assertTrue(m.checkgrad()) + def test_multiout_regression(self): + np.random.seed(0) + import GPy + + N = 10 + N_train = 5 + D = 4 + noise_var = .3 + + k = GPy.kern.RBF(1,lengthscale=0.1) + x = np.random.rand(N,1) + cov = k.K(x) + + k_r = GPy.kern.RBF(2,lengthscale=.4) + x_r = np.random.rand(D,2) + cov_r = k_r.K(x_r) + + cov_all = np.kron(cov_r,cov) + L = GPy.util.linalg.jitchol(cov_all) + + y_latent = L.dot(np.random.randn(N*D)).reshape(D,N).T + + x_test = x[N_train:] + y_test = y_latent[N_train:] + x = x[:N_train] + y = y_latent[:N_train]+np.random.randn(N_train,D)*np.sqrt(noise_var) + + Mr = D + Mc = x.shape[0] + Qr = 5 + Qc = x.shape[1] + + m_mr = GPy.models.GPMultioutRegression(x,y,Xr_dim=Qr, kernel_row=GPy.kern.RBF(Qr,ARD=True), num_inducing=(Mc,Mr),init='GP') + m_mr.optimize_auto(max_iters=1) + m_mr.randomize() + self.assertTrue(m_mr.checkgrad()) + + m_mr = GPy.models.GPMultioutRegression(x,y,Xr_dim=Qr, kernel_row=GPy.kern.RBF(Qr,ARD=True), num_inducing=(Mc,Mr),init='rand') + m_mr.optimize_auto(max_iters=1) + m_mr.randomize() + self.assertTrue(m_mr.checkgrad()) + + def test_multiout_regression_md(self): + import GPy + np.random.seed(0) + + N = 20 + N_train = 5 + D = 8 + noise_var = 0.3 + + k = GPy.kern.RBF(1,lengthscale=0.1) + x_raw = np.random.rand(N*D,1) + + # dimension assignment + D_list = [] + for i in range(2): + while True: + D_sub_list = [] + ratios = [] + r_p = 0. 
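+                # draw three increasing random cut points in (0,1) and turn them into
+                # four positive chunk sizes summing to 4*N_train; retry until no chunk is empty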
+                for j in range(3):
+                    ratios.append(np.random.rand()*(1-r_p)+r_p)
+                    D_sub_list.append(int((ratios[-1]-r_p)*4*N_train))
+                    r_p = ratios[-1]
+                D_sub_list.append(4*N_train - np.sum(D_sub_list))
+                if (np.array(D_sub_list)!=0).all():
+                    D_list.extend([a+N-N_train for a in D_sub_list])
+                    break
+
+        cov = k.K(x_raw)
+
+        k_r = GPy.kern.RBF(2,lengthscale=.4)
+        x_r = np.random.rand(D,2)
+        cov_r = k_r.K(x_r)
+
+        cov_all = np.repeat(np.repeat(cov_r,D_list,axis=0),D_list,axis=1)*cov
+        L = GPy.util.linalg.jitchol(cov_all)
+
+        y_latent = L.dot(np.random.randn(N*D))
+
+        x = np.zeros((D*N_train,))
+        y = np.zeros((D*N_train,))
+        x_test = np.zeros((D*(N-N_train),))
+        y_test = np.zeros((D*(N-N_train),))
+        indexD = np.zeros((D*N_train),dtype=int)
+        indexD_test = np.zeros((D*(N-N_train)),dtype=int)
+
+        offset_all = 0
+        offset_train = 0
+        offset_test = 0
+        for i in range(D):
+            D_test = N-N_train
+            D_train = D_list[i] - N+N_train
+            y[offset_train:offset_train+D_train] = y_latent[offset_all:offset_all+D_train]
+            x[offset_train:offset_train+D_train] = x_raw[offset_all:offset_all+D_train,0]
+            y_test[offset_test:offset_test+D_test] = y_latent[offset_all+D_train:offset_all+D_train+D_test]
+            x_test[offset_test:offset_test+D_test] = x_raw[offset_all+D_train:offset_all+D_train+D_test,0]
+            indexD[offset_train:offset_train+D_train] = i
+            indexD_test[offset_test:offset_test+D_test] = i
+            offset_train += D_train
+            offset_test += D_test
+            offset_all += D_train+D_test
+
+        y_noisefree = y.copy()
+        y += np.random.randn(*y.shape)*np.sqrt(noise_var)
+        x_flat = x.flatten()[:,None]
+        y_flat = y.flatten()[:,None]
+
+        Mr, Mc, Qr, Qc = 4,3,2,1
+
+        m = GPy.models.GPMultioutRegressionMD(x_flat,y_flat,indexD,Xr_dim=Qr, kernel_row=GPy.kern.RBF(Qr,ARD=False), num_inducing=(Mc,Mr))
+        m.optimize_auto(max_iters=1)
+        m.randomize()
+        self.assertTrue(m.checkgrad())
+
+        m = GPy.models.GPMultioutRegressionMD(x_flat,y_flat,indexD,Xr_dim=Qr, kernel_row=GPy.kern.RBF(Qr,ARD=False), num_inducing=(Mc,Mr),init='rand')
+        m.optimize_auto(max_iters=1)
+        m.randomize()
+        self.assertTrue(m.checkgrad())

 if __name__ == "__main__":
     print("Running unit tests, please be (very) patient...")
     unittest.main()
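Usage sketch (reviewer note, not part of the patch): a minimal example of fitting GPMultioutRegression on toy data, following the interface exercised in the tests above. The final predict call is an assumption: it relies on the standard GPy predict path dispatching to PosteriorMultioutput._raw_predict.

    import numpy as np
    import GPy

    np.random.seed(0)
    N, D = 30, 4
    X = np.random.rand(N, 1)
    Y = np.sin(6*X) + 0.1*np.random.randn(N, D)  # D noisy outputs sharing the inputs X

    # num_inducing is (Mc, Mr): inducing points in the input (column) and latent
    # output (row) spaces; Xr_dim is the latent dimensionality of the outputs.
    m = GPy.models.GPMultioutRegression(X, Y, Xr_dim=2,
                                        kernel_row=GPy.kern.RBF(2, ARD=True),
                                        num_inducing=(10, D), init='GP')
    m.optimize_auto(max_iters=1000)  # two-stage optimization defined in this patch
    mu, var = m.predict(X)           # mu is N x D (assumed predict path)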