diff --git a/GPy/examples/sparse_GP_regression_demo.py b/GPy/examples/sparse_GP_regression_demo.py
index 6424b8f2..d67a3831 100644
--- a/GPy/examples/sparse_GP_regression_demo.py
+++ b/GPy/examples/sparse_GP_regression_demo.py
@@ -11,8 +11,8 @@ import numpy as np
 import GPy
 np.random.seed(2)
 pb.ion()
-N = 500
-M = 5
+N = 1200
+M = 20
 
 ######################################
 ## 1 dimensional example
@@ -27,20 +27,21 @@ noise = GPy.kern.white(1)
 kernel = rbf + noise
 
 # create simple GP model
-m1 = GPy.models.sparse_GP_regression(X, Y, kernel, M=M)
+m = GPy.models.sparse_GP_regression(X, Y, kernel, M=M)
 
 # constrain all parameters to be positive
-m1.constrain_positive('(variance|lengthscale|precision)')
-#m1.constrain_positive('(variance|lengthscale)')
-#m1.constrain_fixed('prec',10.)
+m.constrain_positive('(variance|lengthscale|precision)')
+#m.constrain_positive('(variance|lengthscale)')
+#m.constrain_fixed('prec',10.)
 
 #check gradient FIXME unit test please
-m1.checkgrad()
+m.checkgrad(verbose=1)
+stop
 
 # optimize and plot
-m1.optimize('tnc', messages = 1)
-m1.plot()
-# print(m1)
+m.optimize('tnc', messages = 1)
+m.plot()
+# print(m)
 
 ######################################
 ## 2 dimensional example
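Two things to note in the demo diff above: m.checkgrad(verbose=1) compares the model's analytic gradients against finite differences, and the bare "stop" on the following line looks like a deliberate NameError that halts the script before optimization while those gradients are being debugged. For readers unfamiliar with the check, here is a minimal standalone sketch of the same finite-difference idea (fd_checkgrad is a hypothetical helper, not GPy API):

    import numpy as np

    def fd_checkgrad(f, df, x, step=1e-6):
        # hypothetical helper: compare the analytic gradient df(x)
        # against central finite differences of the objective f
        x = np.asarray(x, dtype=float)
        numerical = np.empty_like(x)
        for i in range(x.size):
            e = np.zeros_like(x)
            e[i] = step
            numerical[i] = (f(x + e) - f(x - e)) / (2. * step)
        return np.allclose(numerical, df(x), rtol=1e-4, atol=1e-6)

    # toy usage: f(x) = x.x has gradient 2x
    assert fd_checkgrad(lambda x: np.dot(x, x), lambda x: 2. * x, np.random.randn(5))
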
diff --git a/GPy/models/sparse_GP_regression.py b/GPy/models/sparse_GP_regression.py
index 6313f4f7..6d1b1f44 100644
--- a/GPy/models/sparse_GP_regression.py
+++ b/GPy/models/sparse_GP_regression.py
@@ -37,7 +37,7 @@ class sparse_GP_regression(GP_regression):
     """
 
     def __init__(self,X,Y,kernel=None, X_uncertainty=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False):
-        self.stabilit_multiplier = 1.
+        self.scale_factor = 1e1
         self.beta = beta
         if Z is None:
             self.Z = np.random.permutation(X.copy())[:M]
@@ -60,14 +60,6 @@ class sparse_GP_regression(GP_regression):
             if self.has_uncertain_inputs:
                 self.X_uncertainty /= np.square(self._Xstd)
 
-    def set_param(self, p):
-        self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
-        self.beta = p[self.M*self.Q]
-        self.kern.set_param(p[self.Z.size + 1:])
-        self.beta2 = self.beta**2
-        self._compute_kernel_matrices()
-        self._computations()
-
     def _compute_kernel_matrices(self):
         # kernel computations, using BGPLVM notation
         #TODO: slices for psi statistics (easy enough)
@@ -77,35 +69,62 @@ class sparse_GP_regression(GP_regression):
             self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum()
             self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T
             self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty)
+            raise NotImplementedError, "scale psi2 (in kern?)"
         else:
             self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum()
             self.psi1 = self.kern.K(self.Z,self.X)
-            self.psi2 = np.dot(self.psi1,self.psi1.T)
+            #self.psi2 = np.dot(self.psi1,self.psi1.T)
+            #self.psi2 = self.psi1.T[:,:,None]*self.psi1.T[:,None,:]
+            self.psi1_scaled = self.psi1/self.scale_factor
+            #self.psi2_scaled = psi1_scaled.T[:,:,None]*psi1_scaled.T[:,None,:]
+            self.psi2_scaled = np.dot(self.psi1_scaled,self.psi1_scaled.T)
 
     def _computations(self):
         # TODO find routine to multiply triangular matrices
-        self.V = self.beta*self.Y
+        sf = self.scale_factor
+        sf2 = sf**2
+
+        self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)
+
+        self.V = (self.beta/self.scale_factor)*self.Y
+        self.A = mdot(self.Lmi, self.beta*self.psi2_scaled, self.Lmi.T)
+        self.B = np.eye(self.M)/sf2 + self.A
+
+        self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B)
+
         self.psi1V = np.dot(self.psi1, self.V)
         self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T)
-        self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)
-        self.A = mdot(self.Lmi, self.beta*self.psi2, self.Lmi.T)
-        self.B = np.eye(self.M)*self.stabilit_multiplier + self.A
-        self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B)
-        self.LLambdai = np.dot(self.LBi, self.Lmi)
-        self.trace_K = self.psi0 - np.trace(self.A)/self.beta
-        self.LBL_inv = mdot(self.Lmi.T, self.Bi, self.Lmi)
-        self.C = mdot(self.LLambdai, self.psi1V)
-        self.G = mdot(self.LBL_inv, self.psi1VVpsi1, self.LBL_inv.T)
+        self.C = mdot(self.Lmi.T, self.Bi, self.Lmi)
+        self.E = mdot(self.C, self.psi1VVpsi1/sf2, self.C.T)
 
         # Compute dL_dpsi
         self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N)
-        self.dL_dpsi1 = mdot(self.LLambdai.T,self.C,self.V.T)
-        self.dL_dpsi2 = - 0.5 * self.beta * (self.D*(self.LBL_inv - self.Kmmi) + self.G)
+        self.dL_dpsi1 = mdot(self.V, self.psi1V.T,self.C).T
+        self.dL_dpsi2 = 0.5 * self.beta * self.D * self.Kmmi # dB
+        self.dL_dpsi2 += - 0.5 * self.beta/sf2 * self.D * self.C # dC
+        self.dL_dpsi2 += - 0.5 * self.beta * self.E # dD
 
         # Compute dL_dKmm
-        self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi) # dB
-        self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - 2.*self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi) + self.Kmmi) # dC
-        self.dL_dKmm += np.dot(np.dot(self.G,self.beta*self.psi2) - np.dot(self.LBL_inv, self.psi1VVpsi1), self.Kmmi) + 0.5*self.G # dE
+        self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi)*sf2 # dB
+        self.dL_dKmm += -0.5 * self.D * (- self.C/sf2 - 2.*self.beta*mdot(self.C, self.psi2_scaled, self.Kmmi) + self.Kmmi) # dC
+        self.dL_dKmm += np.dot(np.dot(self.E*sf2, self.beta*self.psi2_scaled) - np.dot(self.C, self.psi1VVpsi1), self.Kmmi) + 0.5*self.E # dD
+
+    def log_likelihood(self):
+        """ Compute the (lower bound on the) log marginal likelihood """
+        sf2 = self.scale_factor**2
+        A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta)) -0.5*self.beta*self.trYYT
+        B = -0.5*self.D*(self.beta*self.psi0-np.trace(self.A)*sf2)
+        C = -0.5*self.D * (self.B_logdet + self.M*np.log(sf2))
+        D = +0.5*np.sum(self.psi1VVpsi1 * self.C)
+        return A+B+C+D
+
+    def set_param(self, p):
+        self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
+        self.beta = p[self.M*self.Q]
+        self.kern.set_param(p[self.Z.size + 1:])
+        self.beta2 = self.beta**2
+        self._compute_kernel_matrices()
+        self._computations()
 
     def get_param(self):
         return np.hstack([self.Z.flatten(),self.beta,self.kern.extract_param()])
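For reference, the relocated log_likelihood above evaluates a variational lower bound on the log marginal likelihood in the style of Titsias (2009), with the scale factor removed again analytically. Reading the code with sf = 1 and psi2 = psi1 psi1^T, the four terms A-D assemble (my reconstruction from the code, not stated in the patch):

    F = -\frac{ND}{2}\log\frac{2\pi}{\beta}
        - \frac{\beta}{2}\operatorname{tr}(YY^{\top})
        - \frac{\beta D}{2}\left(\psi_0 - \operatorname{tr}(K_{mm}^{-1}\psi_2)\right)
        - \frac{D}{2}\left(\log\lvert K_{mm}+\beta\psi_2\rvert - \log\lvert K_{mm}\rvert\right)
        + \frac{\beta^2}{2}\operatorname{tr}\!\left(Y^{\top}\psi_1^{\top}(K_{mm}+\beta\psi_2)^{-1}\psi_1 Y\right)

where self.C = mdot(Lmi.T, Bi, Lmi) plays the role of (K_mm + beta psi2)^{-1}, and B_logdet + M*log(sf2) restores the unscaled log|I + Lm^{-1} beta psi2 Lm^{-T}|.
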
""" #TODO: suport heteroscedatic noise - dA_dbeta = 0.5 * self.N*self.D/self.beta - dB_dbeta = - 0.5 * self.D * self.trace_K + sf2 = self.scale_factor**2 + dA_dbeta = 0.5 * self.N*self.D/self.beta - 0.5 * self.trYYT + dB_dbeta = - 0.5 * self.D * self.psi0 - np.trace(self.A)/self.beta*sf2 dC_dbeta = - 0.5 * self.D * np.sum(self.Bi*self.A)/self.beta - dD_dbeta = - 0.5 * self.trYYT - tmp = mdot(self.LBi.T, self.LLambdai, self.psi1V) - dE_dbeta = (np.sum(np.square(self.C)) - 0.5 * np.sum(self.A * np.dot(tmp, tmp.T)))/self.beta + tmp = mdot(self.Bi, self.Lmi, self.psi1V) + dD_dbeta = (np.sum(np.square(self.C)) - 0.5 * np.sum(self.A * np.dot(tmp, tmp.T)))/self.beta - return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta + dE_dbeta) + return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta) def dL_dtheta(self): """