diff --git a/GPy/core/model.py b/GPy/core/model.py
index 3937a9cc..3d64441e 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -18,6 +18,7 @@ class model(parameterised):
         self.optimization_runs = []
         self.sampling_runs = []
         self.set_param(self.get_param())
+        self.preferred_optimizer = 'tnc'
     def get_param(self):
         raise NotImplementedError, "this needs to be implemented to utilise the model class"
     def set_param(self,x):
@@ -161,15 +162,18 @@ class model(parameterised):
         else:
             self.expand_param(initial_parameters)
-    def optimize(self, optimizer = 'tnc', start = None, **kwargs):
+    def optimize(self, optimizer=None, start=None, **kwargs):
         """
         Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
         kwargs are passed to the optimizer. They can be:
         :max_f_eval: maximum number of function evaluations
         :messages: whether to display during optimisation
-
+        :param optimizer: which optimizer to use (defaults to self.preferred_optimizer)
+        :type optimizer: string TODO: valid strings?
         """
+        if optimizer is None:
+            optimizer = self.preferred_optimizer
         def f(x):
             self.expand_param(x)
diff --git a/GPy/examples/sparse_GPLVM_demo.py b/GPy/examples/sparse_GPLVM_demo.py
index af125d6e..6ca6c941 100644
--- a/GPy/examples/sparse_GPLVM_demo.py
+++ b/GPy/examples/sparse_GPLVM_demo.py
@@ -9,6 +9,7 @@ np.random.seed(1)
 print "sparse GPLVM with RBF kernel"
 N = 100
+M = 4
 Q = 1
 D = 2
 #generate GPLVM-like data
@@ -17,7 +18,7 @@ k = GPy.kern.rbf(Q, 1.0, 2.0) + GPy.kern.white(Q, 0.00001)
 K = k.K(X)
 Y = np.random.multivariate_normal(np.zeros(N),K,D).T
 
-m = GPy.models.sparse_GPLVM(Y, Q, M = 10)
+m = GPy.models.sparse_GPLVM(Y, Q, M=M)
 m.constrain_positive('(rbf|bias|noise)')
 m.constrain_bounded('white', 1e-3, 0.1)
 # m.plot()
diff --git a/GPy/examples/sparse_GP_regression_demo.py b/GPy/examples/sparse_GP_regression_demo.py
index 13ff47e0..6424b8f2 100644
--- a/GPy/examples/sparse_GP_regression_demo.py
+++ b/GPy/examples/sparse_GP_regression_demo.py
@@ -12,6 +12,7 @@ import GPy
 np.random.seed(2)
 pb.ion()
 N = 500
+M = 5
 
 ######################################
 ## 1 dimensional example
@@ -26,7 +27,7 @@ noise = GPy.kern.white(1)
 kernel = rbf + noise
 
 # create simple GP model
-m1 = GPy.models.sparse_GP_regression(X,Y,kernel, M = 10)
+m1 = GPy.models.sparse_GP_regression(X, Y, kernel, M=M)
 
 # contrain all parameters to be positive
 m1.constrain_positive('(variance|lengthscale|precision)')
diff --git a/GPy/inference/optimization.py b/GPy/inference/optimization.py
index 7c575a94..c96d2a3b 100644
--- a/GPy/inference/optimization.py
+++ b/GPy/inference/optimization.py
@@ -23,8 +23,7 @@ class Optimizer():
     :rtype: optimizer object.
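+    Note: f_fp is expected to return the pair (f(x), fp(x)); f and fp return the
+    objective and its gradient separately (a gloss on the calling contract, not
+    stated in this patch). Concrete optimizers subclass this and set self.opt_name.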
""" - def __init__(self, x_init, f_fp, f, fp , messages = False, max_f_eval = 1e4, ftol = None, gtol = None, xtol = None): - + def __init__(self, x_init, f_fp, f, fp , messages=False, max_f_eval=1e4, ftol=None, gtol=None, xtol=None): self.opt_name = None self.f_fp = f_fp self.f = f diff --git a/GPy/kern/Matern32.py b/GPy/kern/Matern32.py index c37bd5c0..8223b37a 100644 --- a/GPy/kern/Matern32.py +++ b/GPy/kern/Matern32.py @@ -39,11 +39,13 @@ class Matern32(kernpart): def get_param(self): """return the value of the parameters.""" return np.hstack((self.variance,self.lengthscales)) + def set_param(self,x): """set the value of the parameters.""" assert x.size==(self.D+1) self.variance = x[0] self.lengthscales = x[1:] + def get_param_names(self): """return parameter names.""" if self.D==1: @@ -56,10 +58,37 @@ class Matern32(kernpart): if X2 is None: X2 = X dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1)) np.add(self.variance*(1+np.sqrt(3.)*dist)*np.exp(-np.sqrt(3.)*dist), target,target) + def Kdiag(self,X,target): """Compute the diagonal of the covariance matrix associated to X.""" np.add(target,self.variance,target) + def dK_dtheta(self,partial,X,X2,target): + """derivative of the covariance matrix with respect to the parameters.""" + if X2 is None: X2 = X + dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1)) + dvar = (1+np.sqrt(3.)*dist)*np.exp(-np.sqrt(3.)*dist) + invdist = 1./np.where(dist!=0.,dist,np.inf) + dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscales**3 + dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis] + target[0] += np.sum(dvar*partial) + target[1:] += (dl*partial[:,:,None]).sum(0).sum(0) + + def dKdiag_dtheta(self,partial,X,target): + """derivative of the diagonal of the covariance matrix with respect to the parameters.""" + target[0] += np.sum(partial) + + def dK_dX(self,X,X2,target): + """derivative of the covariance matrix with respect to X.""" + if X2 is None: X2 = X + dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None] + ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf) + dK_dX += - np.transpose(3*self.variance*dist*np.exp(-np.sqrt(3)*dist)*ddist_dX,(1,0,2)) + target += np.sum(dK_dX*partial.T[:,:,None],0) + + def dKdiag_dX(self,X,target): + pass + def Gram_matrix(self,F,F1,F2,lower,upper): """ Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to D=1. 
@@ -87,25 +116,3 @@ class Matern32(kernpart):
         #return(G)
         return(self.lengthscales**3/(12.*np.sqrt(3)*self.variance) * G + 1./self.variance*np.dot(Flower,Flower.T) + self.lengthscales**2/(3.*self.variance)*np.dot(F1lower,F1lower.T))
 
-    def dK_dtheta(self,X,X2,target):
-        """derivative of the cross-covariance matrix with respect to the parameters (shape is NxMxNparam)"""
-        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
-        dvar = (1+np.sqrt(3.)*dist)*np.exp(-np.sqrt(3.)*dist)
-        invdist = 1./np.where(dist!=0.,dist,np.inf)
-        dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscales**3
-        dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
-        np.add(target[:,:,0],dvar, target[:,:,0])
-        np.add(target[:,:,1:],dl, target[:,:,1:])
-    def dKdiag_dtheta(self,X,target):
-        """derivative of the diagonal of the covariance matrix with respect to the parameters (shape is NxNparam)"""
-        np.add(target[:,0],1.,target[:,0])
-    def dK_dX(self,X,X2,target):
-        """derivative of the covariance matrix with respect to X (*! shape is NxMxD !*)."""
-        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None]
-        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf)
-        target += - np.transpose(3*self.variance*dist*np.exp(-np.sqrt(3)*dist)*ddist_dX,(1,0,2))
-    def dKdiag_dX(self,X,target):
-        pass
-
diff --git a/GPy/kern/Matern52.py b/GPy/kern/Matern52.py
index 4af65a89..65059a5b 100644
--- a/GPy/kern/Matern52.py
+++ b/GPy/kern/Matern52.py
@@ -33,33 +33,61 @@ class Matern52(kernpart):
         self.Nparam = self.D + 1
         self.name = 'Mat52'
         self.set_param(np.hstack((variance,lengthscales)))
-        self._Z, self._mu, self._S = np.empty(shape=(3,1)) # cached versions of Z,mu,S
+
     def get_param(self):
         """return the value of the parameters."""
         return np.hstack((self.variance,self.lengthscales))
+
     def set_param(self,x):
         """set the value of the parameters."""
         assert x.size==(self.D+1)
         self.variance = x[0]
         self.lengthscales = x[1:]
-        self.lengthscales2 = np.square(self.lengthscales)
-        self._Z, self._mu, self._S = np.empty(shape=(3,1)) # cached versions of Z,mu,S
+
     def get_param_names(self):
         """return parameter names."""
         if self.D==1:
             return ['variance','lengthscale']
         else:
             return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscales.size)]
+
     def K(self,X,X2,target):
         """Compute the covariance matrix between X and X2."""
         if X2 is None: X2 = X
         dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
         np.add(self.variance*(1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist), target,target)
+
     def Kdiag(self,X,target):
         """Compute the diagonal of the covariance matrix associated to X."""
         np.add(target,self.variance,target)
+
+    def dK_dtheta(self,partial,X,X2,target):
+        """derivative of the covariance matrix with respect to the parameters."""
+        if X2 is None: X2 = X
+        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
+        invdist = 1./np.where(dist!=0.,dist,np.inf)
+        dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscales**3
+        dvar = (1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist)
+        dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
+        target[0] += np.sum(dvar*partial)
+        target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)
+
+    def dKdiag_dtheta(self,partial,X,target):
+        """derivative of the diagonal of the covariance matrix with respect to the parameters."""
+        target[0] += np.sum(partial)
+
+    def dK_dX(self,partial,X,X2,target):
+        """derivative of the covariance matrix with respect to X."""
+        if X2 is None: X2 = X
+        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None]
+        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf)
+        dK_dX = -np.transpose(self.variance*5./3*dist*(1+np.sqrt(5)*dist)*np.exp(-np.sqrt(5)*dist)*ddist_dX,(1,0,2))
+        target += np.sum(dK_dX*partial.T[:,:,None],0)
+
+    def dKdiag_dX(self,partial,X,target):
+        pass
+
     def Gram_matrix(self,F,F1,F2,F3,lower,upper):
         """
         Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to D=1.
@@ -91,26 +119,5 @@ class Matern52(kernpart):
         orig2 = 3./5*self.lengthscales**2 * ( np.dot(F1lower,F1lower.T) + 1./8*np.dot(Flower,F2lower.T) + 1./8*np.dot(F2lower,Flower.T))
         return(1./self.variance* (G_coef*G + orig + orig2))
 
-    def dK_dtheta(self,X,X2,target):
-        """derivative of the cross-covariance matrix with respect to the parameters (shape is NxMxNparam)"""
-        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
-        invdist = 1./np.where(dist!=0.,dist,np.inf)
-        dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscales**3
-        dvar = (1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist)
-        dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
-        np.add(target[:,:,0],dvar, target[:,:,0])
-        np.add(target[:,:,1:],dl, target[:,:,1:])
-    def dKdiag_dtheta(self,X,target):
-        """derivative of the diagonal of the covariance matrix with respect to the parameters (shape is NxNparam)"""
-        np.add(target[:,0],1.,target[:,0])
-    def dK_dX(self,X,X2,target):
-        """derivative of the covariance matrix with respect to X (*! shape is NxMxD !*)."""
-        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None]
-        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf)
-        target += - np.transpose(self.variance*5./3*dist*(1+np.sqrt(5)*dist)*np.exp(-np.sqrt(5)*dist)*ddist_dX,(1,0,2))
-    def dKdiag_dX(self,X,target):
-        pass
diff --git a/GPy/kern/exponential.py b/GPy/kern/exponential.py
index 402ebd82..ba97881e 100644
--- a/GPy/kern/exponential.py
+++ b/GPy/kern/exponential.py
@@ -62,7 +62,7 @@ class exponential(kernpart):
         np.add(target,self.variance,target)
 
     def dK_dtheta(self,partial,X,X2,target):
-        """derivative of the cross-covariance matrix with respect to the parameters (shape is NxMxNparam)"""
+        """derivative of the covariance matrix with respect to the parameters."""
         if X2 is None: X2 = X
         dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
         invdist = 1./np.where(dist!=0.,dist,np.inf)
@@ -73,12 +73,12 @@ class exponential(kernpart):
         target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)
 
     def dKdiag_dtheta(self,partial,X,target):
-        """derivative of the diagonal of the covariance matrix with respect to the parameters (shape is NxNparam)"""
+        """derivative of the diagonal of the covariance matrix with respect to the parameters."""
         #NB: derivative of diagonal elements wrt lengthscale is 0
         target[0] += np.sum(partial)
 
     def dK_dX(self,X,X2,target):
-        """derivative of the covariance matrix with respect to X (*! shape is NxMxD !*)."""
shape is NxMxD !*).""" + """derivative of the covariance matrix with respect to X.""" if X2 is None: X2 = X dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None] ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf) diff --git a/GPy/kern/linear.py b/GPy/kern/linear.py index d3e3f42c..7246244e 100644 --- a/GPy/kern/linear.py +++ b/GPy/kern/linear.py @@ -21,6 +21,7 @@ class linear(kernpart): self.Nparam = 1 self.name = 'linear' self.set_param(variance) + self._Xcache, self._X2cache = np.empty(shape=(2,)) def get_param(self): return self.variance @@ -32,7 +33,8 @@ class linear(kernpart): return ['variance'] def K(self,X,X2,target): - target += self.variance * np.dot(X, X2.T) + self._K_computations(X, X2) + target += self.variance * self._dot_product def Kdiag(self,X,target): np.add(target,np.sum(self.variance*np.square(X),-1),target) @@ -42,7 +44,9 @@ class linear(kernpart): Computes the derivatives wrt theta Return shape is NxMx(Ntheta) """ - product = np.dot(X, X2.T) + self._K_computations(X, X2) + product = self._dot_product + # product = np.dot(X, X2.T) target += np.sum(product*partial) def dK_dX(self,partial,X,X2,target): @@ -51,6 +55,20 @@ class linear(kernpart): def dKdiag_dtheta(self,partial,X,target): target += np.sum(partial*np.square(X).sum(1)) + def _K_computations(self,X,X2): + # (Nicolo) changed the logic here. If X2 is None, we want to cache + # (X,X). In practice X2 should always be passed. + if X2 is None: + X2 = X + if not (np.all(X==self._Xcache) and np.all(X2==self._X2cache)): + self._Xcache = X + self._X2cache = X2 + self._dot_product = np.dot(X,X2.T) + else: + # print "Cache hit!" + pass # TODO: insert debug message here (logging framework) + + # def psi0(self,Z,mu,S,target): # expected = np.square(mu) + S # np.add(target,np.sum(self.variance*expected),target) diff --git a/GPy/kern/rbf_ARD.py b/GPy/kern/rbf_ARD.py index f1e5f36a..1f90bb0a 100644 --- a/GPy/kern/rbf_ARD.py +++ b/GPy/kern/rbf_ARD.py @@ -30,6 +30,7 @@ class rbf_ARD(kernpart): def get_param(self): return np.hstack((self.variance,self.lengthscales)) + def set_param(self,x): assert x.size==(self.D+1) self.variance = x[0] @@ -37,61 +38,73 @@ class rbf_ARD(kernpart): self.lengthscales2 = np.square(self.lengthscales) #reset cached results self._Z, self._mu, self._S = np.empty(shape=(3,1)) # cached versions of Z,mu,S + def get_param_names(self): if self.D==1: return ['variance','lengthscale'] else: return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscales.size)] + def K(self,X,X2,target): self._K_computations(X,X2) np.add(self.variance*self._K_dvar, target,target) + def Kdiag(self,X,target): np.add(target,self.variance,target) - def dK_dtheta(self,X,X2,target): - """Return shape is NxMx(Ntheta)""" + + def dK_dtheta(self,partial,X,X2,target): self._K_computations(X,X2) dl = self._K_dvar[:,:,None]*self.variance*self._K_dist2/self.lengthscales - np.add(target[:,:,0],self._K_dvar, target[:,:,0]) - np.add(target[:,:,1:],dl, target[:,:,1:]) + target[0] += np.sum(self._K_dvar*partial) + target[1:] += (dl*partial[:,:,None]).sum(0).sum(0) + def dKdiag_dtheta(self,X,target): - np.add(target[:,0],1.,target[:,0]) + target[0] += np.sum(partial) + def dK_dX(self,X,X2,target): self._K_computations(X,X2) dZ = self.variance*self._K_dvar[:,:,None]*self._K_dist/self.lengthscales2 - np.add(target,-dZ.transpose(1,0,2),target) + dK_dX = -dZ.transpose(1,0,2) + target += np.sum(dK_dX*partial.T[:,:,None],0) + + def dKdiag_dX(self,partial,X,target): + 
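+        """Kdiag is independent of X for this stationary kernel, so the derivative is identically zero."""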
+        pass
+
     def psi0(self,Z,mu,S,target):
-        np.add(target, self.variance, target)
-    def dpsi0_dtheta(self,Z,mu,S,target):
-        target[:,0] += 1.
+        target += self.variance
+
+    def dpsi0_dtheta(self,partial,Z,mu,S,target):
+        target[0] += np.sum(partial)
+
     def dpsi0_dmuS(self,Z,mu,S,target_mu,target_S):
         pass
+
     def psi1(self,Z,mu,S,target):
-        """Think N,M,Q """
         self._psi_computations(Z,mu,S)
         np.add(target, self._psi1,target)
 
-    def dpsi1_dtheta(self,Z,mu,S,target):
-        """N,Q,(Ntheta)"""
+    def dpsi1_dtheta(self,partial,Z,mu,S,target):
         self._psi_computations(Z,mu,S)
         denom_deriv = S[:,None,:]/(self.lengthscales**3+self.lengthscales*S[:,None,:])
         d_length = self._psi1[:,:,None]*(self.lengthscales*np.square(self._psi1_dist/(self.lengthscales2+S[:,None,:])) + denom_deriv)
-        target[:,:,0] += self._psi1/self.variance
-        target[:,:,1:] += d_length
+        target[0] += np.sum(partial*self._psi1/self.variance)
+        target[1:] += (d_length*partial[:,:,None]).sum(0).sum(0)
+
-    def dpsi1_dZ(self,Z,mu,S,target):
+    def dpsi1_dZ(self,partial,Z,mu,S,target):
         self._psi_computations(Z,mu,S)
-        np.add(target,-self._psi1[:,:,None]*self._psi1_dist/self.lengthscales2/self._psi1_denom,target)
+        target += np.sum(partial[:,:,None]*-self._psi1[:,:,None]*self._psi1_dist/self.lengthscales2/self._psi1_denom,0)
 
-    def dpsi1_dmuS(self,Z,mu,S,target_mu,target_S):
+    def dpsi1_dmuS(self,partial,Z,mu,S,target_mu,target_S):
         """return shapes are N,M,Q"""
         self._psi_computations(Z,mu,S)
         tmp = self._psi1[:,:,None]/self.lengthscales2/self._psi1_denom
-        np.add(target_mu,tmp*self._psi1_dist,target_mu)
-        np.add(target_S, 0.5*tmp*(self._psi1_dist_sq-1), target_S)
+        target_mu += np.sum(partial[:,:,None]*tmp*self._psi1_dist,1)
+        target_S += np.sum(partial[:,:,None]*0.5*tmp*(self._psi1_dist_sq-1),1)
 
     def psi2(self,Z,mu,S,target):
-        """shape N,M,M"""
         self._psi_computations(Z,mu,S)
-        np.add(target, self._psi2,target)
+        target += self._psi2.sum(0) #TODO: psi2 should be NxMxM (for het. noise)
 
-    def dpsi2_dtheta(self,Z,mu,S,target):
+    def dpsi2_dtheta(self,partial,Z,mu,S,target):
         """Shape N,M,M,Ntheta"""
@@ -99,21 +112,21 @@ class rbf_ARD(kernpart):
         d_var = np.sum(2.*self._psi2/self.variance,0)
         d_length = self._psi2[:,:,:,None]*(0.5*self._psi2_Zdist_sq*self._psi2_denom + 2.*self._psi2_mudist_sq + 2.*S[:,None,None,:]/self.lengthscales2)/(self.lengthscales*self._psi2_denom)
         d_length = d_length.sum(0)
-        target[:,:,0] += d_var
-        target[:,:,1:] += d_length
+        target[0] += np.sum(partial*d_var)
+        target[1:] += (d_length*partial[:,:,None]).sum(0).sum(0)
 
-    def dpsi2_dZ(self,Z,mu,S,target):
+    def dpsi2_dZ(self,partial,Z,mu,S,target):
         """Returns shape N,M,M,Q"""
         self._psi_computations(Z,mu,S)
         dZ = self._psi2[:,:,:,None]/self.lengthscales2*(-0.5*self._psi2_Zdist + self._psi2_mudist/self._psi2_denom)
-        target += dZ
+        target += np.sum(partial[None,:,:,None]*dZ,0).sum(1)
 
-    def dpsi2_dmuS(self,Z,mu,S,target_mu,target_S):
+    def dpsi2_dmuS(self,partial,Z,mu,S,target_mu,target_S):
         """Think N,M,M,Q """
         self._psi_computations(Z,mu,S)
         tmp = self._psi2[:,:,:,None]/self.lengthscales2/self._psi2_denom
-        np.add(target_mu, -tmp*(2.*self._psi2_mudist),target_mu) #N,M,M,Q
-        np.add(target_S, tmp*(2.*self._psi2_mudist_sq-1), target_S) #N,M,M,Q
+        target_mu += (partial[None,:,:,None]*-tmp*2.*self._psi2_mudist).sum(1).sum(1)
+        target_S += (partial[None,:,:,None]*tmp*(2.*self._psi2_mudist_sq-1)).sum(1).sum(1)
 
     def _K_computations(self,X,X2):
         if not (np.all(X==self._X) and np.all(X2==self._X2)):
diff --git a/GPy/models/sparse_GP_regression.py b/GPy/models/sparse_GP_regression.py
index a221ad31..38aeef08 100644
--- a/GPy/models/sparse_GP_regression.py
+++ b/GPy/models/sparse_GP_regression.py
@@ -10,6 +10,10 @@
 from .. import kern
 from ..inference.likelihoods import likelihood
 from GP_regression import GP_regression
+#Still TODO:
+# make use of slices properly (kernel can now do this)
+# enable heteroscedastic noise (kernel will need to compute psi2 as a (NxMxM) array)
+
 class sparse_GP_regression(GP_regression):
     """
     Variational sparse GP model (Regression)
@@ -22,6 +26,8 @@ class sparse_GP_regression(GP_regression):
     :type kernel: a GPy kernel
     :param Z: inducing inputs (optional, see note)
     :type Z: np.ndarray (M x Q) | None
+    :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance)
+    :type X_uncertainty: np.ndarray (N x Q) | None
     :param Zslices: slices for the inducing inputs (see slicing TODO: link)
     :param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
     :type M: int
@@ -31,7 +37,7 @@ class sparse_GP_regression(GP_regression):
     :type normalize_(X|Y): bool
     """
-    def __init__(self,X,Y,kernel=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False):
+    def __init__(self,X,Y,kernel=None, X_uncertainty=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False):
         self.beta = beta
         if Z is None:
             self.Z = np.random.permutation(X.copy())[:M]
@@ -40,10 +46,20 @@
             assert Z.shape[1]==X.shape[1]
             self.Z = Z
             self.M = Z.shape[1]
+        if X_uncertainty is None:
+            self.has_uncertain_inputs=False
+        else:
+            assert X_uncertainty.shape==X.shape
+            self.has_uncertain_inputs=True
+            self.X_uncertainty = X_uncertainty
-        GP_regression.__init__(self, X, Y, kernel = kernel, normalize_X = normalize_X, normalize_Y = normalize_Y)
+        GP_regression.__init__(self, X, Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y)
         self.trYYT = np.sum(np.square(self.Y))
+        #normalise X uncertainty also
+        if self.has_uncertain_inputs:
+            self.X_uncertainty /= np.square(self._Xstd)
+
     def set_param(self, p):
         self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
         self.beta = p[self.M*self.Q]
@@ -54,18 +70,23 @@ class sparse_GP_regression(GP_regression):
     def _compute_kernel_matrices(self):
         # kernel computations, using BGPLVM notation
-        #TODO: the following can be switched out in the case of uncertain inputs (or the BGPLVM!)
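+        # psi statistics in the BGPLVM sense: psi0 = E_q(X)[tr Kxx], psi1 = E_q(X)[Kmx],
+        # psi2 = sum_n E_q(X)[Kmx_n Kxm_n], with q(X) = N(X | mu, X_uncertainty); for
+        # deterministic inputs these reduce to Kdiag(X).sum(), K(Z,X) and psi1*psi1.T below.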
         #TODO: slices for psi statistics (easy enough)
         self.Kmm = self.kern.K(self.Z)
-        self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum()
-        self.psi1 = self.kern.K(self.Z,self.X)
-        self.psi2 = np.dot(self.psi1,self.psi1.T)
+        if self.has_uncertain_inputs:
+            self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum()
+            self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T
+            self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty)
+        else:
+            self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum()
+            self.psi1 = self.kern.K(self.Z,self.X)
+            self.psi2 = np.dot(self.psi1,self.psi1.T)
 
     def _computations(self):
         # TODO find routine to multiply triangular matrices
-        self.psi1Y = np.dot(self.psi1, self.Y)
-        self.psi1YYpsi1 = np.dot(self.psi1Y, self.psi1Y.T)
+        self.V = self.beta*self.Y
+        self.psi1V = np.dot(self.psi1, self.V)
+        self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T)
         self.Lm = jitchol(self.Kmm)
         self.Lmi = chol_inv(self.Lm)
         self.Kmmi = np.dot(self.Lmi.T, self.Lmi)
@@ -77,25 +98,19 @@ class sparse_GP_regression(GP_regression):
         self.LLambdai = np.dot(self.LBi, self.Lmi)
         self.trace_K = self.psi0 - np.trace(self.A)
         self.LBL_inv = mdot(self.Lmi.T, self.Bi, self.Lmi)
-        self.C = mdot(self.LLambdai, self.psi1Y)
-        self.G = mdot(self.LBL_inv, self.psi1YYpsi1, self.LBL_inv.T)
+        self.C = mdot(self.LLambdai, self.psi1V)
+        self.G = mdot(self.LBL_inv, self.psi1VVpsi1, self.LBL_inv.T)
 
-        # Computes dL_dpsi
+        # Compute dL_dpsi
         self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N)
-        dC_dpsi1 = (self.LLambdai.T[:,:, None, None] * self.Y) # this is sane.
-        tmp = (dC_dpsi1*self.C[None,:,None,:]).sum(1).sum(-1)
-        self.dL_dpsi1 = self.beta2 * tmp
-        self.dL_dpsi2 = (- 0.5 * self.D * self.beta * (self.LBL_inv - self.Kmmi)
-                         - self.beta**3 * 0.5 * self.G)
+        dC_dpsi1 = (self.LLambdai.T[:,:, None, None] * self.V)
+        self.dL_dpsi1 = (dC_dpsi1*self.C[None,:,None,:]).sum(1).sum(-1)
+        self.dL_dpsi2 = - 0.5 * self.beta * (self.D*(self.LBL_inv - self.Kmmi) + self.G)
 
-        # Computes dL_dKmm TODO: nicer precomputations
-
-        tmp = self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi)
-        self.dL_dKmm = -self.beta * self.D * 0.5 * mdot(self.Lmi.T, self.A, self.Lmi) # dB
-        self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - tmp - tmp.T + self.Kmmi) # dC
-        tmp = (mdot(self.LBL_inv, self.psi1YYpsi1, self.Kmmi)
-               - self.beta*mdot(self.G, self.psi2, self.Kmmi))
-        self.dL_dKmm += -0.5*self.beta2*(tmp + tmp.T - self.G) # dE
+        # Compute dL_dKmm
+        self.dL_dKmm = -0.5 * self.beta * self.D * mdot(self.Lmi.T, self.A, self.Lmi) # dB
+        self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - 2.*self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi) + self.Kmmi) # dC
+        self.dL_dKmm += np.dot(np.dot(self.G,self.beta*self.psi2) - np.dot(self.LBL_inv, self.psi1VVpsi1), self.Kmmi) + 0.5*self.G # dE
 
     def get_param(self):
         return np.hstack([self.Z.flatten(),self.beta,self.kern.extract_param()])
@@ -104,65 +119,83 @@ class sparse_GP_regression(GP_regression):
         return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + ['noise_precision']+self.kern.extract_param_names()
 
     def log_likelihood(self):
+        """
+        Compute the (lower bound on the) log marginal likelihood
+        """
         A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta))
         B = -0.5*self.beta*self.D*self.trace_K
         C = -self.D * np.sum(np.log(np.diag(self.LB)))
         D = -0.5*self.beta*self.trYYT
-        E = +0.5*self.beta2*np.sum(self.psi1YYpsi1 * self.LBL_inv)
-
+        E = +0.5*np.sum(self.psi1VVpsi1 * self.LBL_inv)
         return A+B+C+D+E
 
     def dL_dbeta(self):
-        """
-        compute the gradient of the log likelihood wrt beta.
-        TODO: suport heteroscedatic noise"""
+        """
+        Compute the gradient of the log likelihood wrt beta.
+        TODO: support heteroscedastic noise
+        """
         dA_dbeta = 0.5 * self.N*self.D/self.beta
         dB_dbeta = - 0.5 * self.D * self.trace_K
         dC_dbeta = - 0.5 * self.D * np.sum(self.Bi*self.A)
         dD_dbeta = - 0.5 * self.trYYT
-        tmp = mdot(self.LBi.T, self.LLambdai, self.psi1Y)
-        dE_dbeta = (self.beta * np.sum(np.square(self.C)) - 0.5 * self.beta2
-                    * np.sum(self.A * np.dot(tmp, tmp.T)))
+        tmp = mdot(self.LBi.T, self.LLambdai, self.psi1V)
+        dE_dbeta = np.sum(np.square(self.C))/self.beta - 0.5 * np.sum(self.A * np.dot(tmp, tmp.T))
 
         return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta + dE_dbeta)
 
     def dL_dtheta(self):
-        #re-cast computations in psi2 back to psi1:
-        dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1)
-
+        """
+        Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel
+        """
         dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z)
-        dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X)
-        dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X)
+        if self.has_uncertain_inputs:
+            dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty)
+            dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
+            dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # for multiple_beta, dL_dpsi2 will be a different shape
+        else:
+            #re-cast computations in psi2 back to psi1:
+            dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1)
+            dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X)
+            dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X)
         return dL_dtheta
 
    def dL_dZ(self):
-        #re-cast computations in psi2 back to psi1:
-        dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1)
-
+        """
+        The derivative of the bound wrt the inducing inputs Z
+        """
         dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z,)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ
-        dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X)
+        if self.has_uncertain_inputs:
+            dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
+            dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty)
+        else:
+            #re-cast computations in psi2 back to psi1:
+            dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1)
+            dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X)
        return dL_dZ
 
     def log_likelihood_gradients(self):
         return np.hstack([self.dL_dZ().flatten(), self.dL_dbeta(), self.dL_dtheta()])
 
-    def _raw_predict(self,Xnew,slices):
+    def _raw_predict(self, Xnew, slices):
         """Internal helper function for making predictions, does not account for normalisation"""
         Kx = self.kern.K(self.Z, Xnew)
         Kxx = self.kern.K(Xnew)
-        mu = self.beta * mdot(Kx.T, self.LBL_inv, self.psi1Y)
+        mu = mdot(Kx.T, self.LBL_inv, self.psi1V)
         var = Kxx - mdot(Kx.T, (self.Kmmi - self.LBL_inv), Kx) + np.eye(Xnew.shape[0])/self.beta # TODO: This beta doesn't belong here in the EP case.
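+        # i.e. mu = Kxm (Kmm + beta*psi2)^{-1} psi1 V with V = beta*Y, and the variance takes the
+        # usual sparse form Kxx - Kxm (Kmm^{-1} - (Kmm + beta*psi2)^{-1}) Kmx, plus noise 1/beta.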
         return mu,var
 
-    def plot(self,*args,**kwargs):
+    def plot(self, *args, **kwargs):
         """
         Plot the fitted model: just call the GP_regression plot function and then add inducing inputs
         """
         GP_regression.plot(self,*args,**kwargs)
         if self.Q==1:
             pb.plot(self.Z,self.Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12)
+            if self.has_uncertain_inputs:
+                pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten()))
         if self.Q==2:
             pb.plot(self.Z[:,0],self.Z[:,1],'wo')
diff --git a/GPy/models/uncertain_input_GP_regression.py b/GPy/models/uncertain_input_GP_regression.py
deleted file mode 100644
index 66724b07..00000000
--- a/GPy/models/uncertain_input_GP_regression.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-import numpy as np
-import pylab as pb
-from ..util.linalg import mdot, jitchol, chol_inv, pdinv
-from ..util.plot import gpplot
-from .. import kern
-from ..inference.likelihoods import likelihood
-from sparse_GP_regression import sparse_GP_regression
-
-class uncertain_input_GP_regression(sparse_GP_regression):
-    """
-    Variational sparse GP model (Regression) with uncertainty on the inputs
-
-    :param X: inputs
-    :type X: np.ndarray (N x Q)
-    :param X_uncertainty: uncertainty on X (Gaussian variances, assumed isotrpic)
-    :type X_uncertainty: np.ndarray (N x Q)
-    :param Y: observed data
-    :type Y: np.ndarray of observations (N x D)
-    :param kernel : the kernel/covariance function. See link kernels
-    :type kernel: a GPy kernel
-    :param Z: inducing inputs (optional, see note)
-    :type Z: np.ndarray (M x Q) | None
-    :param Zslices: slices for the inducing inputs (see slicing TODO: link)
-    :param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
-    :type M: int
-    :param beta: noise precision. TODO> ignore beta if doing EP
-    :type beta: float
-    :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
-    :type normalize_(X|Y): bool
-    """
-
-    def __init__(self,X,Y,X_uncertainty,kernel=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False):
-        self.X_uncertainty = X_uncertainty
-        sparse_GP_regression.__init__(self, X, Y, kernel = kernel, beta = beta, normalize_X = normalize_X, normalize_Y = normalize_Y)
-        self.trYYT = np.sum(np.square(self.Y))
-
-    def _compute_kernel_matrices(self):
-        # kernel computations, using BGPLVM notation
-        #TODO: slices for psi statistics (easy enough)
-        self.Kmm = self.kern.K(self.Z)
-        self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum()
-        self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T
-        self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty)
-
-    def dL_dtheta(self):
-        #re-cast computations in psi2 back to psi1:
-        dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z)
-        dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty)
-        dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
-        dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # for multiple_beta, dL_dpsi2 will be a different shape
-        return dL_dtheta
-
-    def dL_dZ(self):
-        dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z,)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ
-        dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
-        dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty)
-        return dL_dZ
-
-    def plot(self,*args,**kwargs):
-        """
-        Plot the fitted model: just call the sparse GP_regression plot function and then add
-        markers to represent uncertainty on the inputs
-        """
-        sparse_GP_regression.plot(self,*args,**kwargs)
-        if self.Q==1:
-            pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten()))
-
diff --git a/GPy/models/uncollapsed_sparse_GP.py b/GPy/models/uncollapsed_sparse_GP.py
new file mode 100644
index 00000000..b5d4b054
--- /dev/null
+++ b/GPy/models/uncollapsed_sparse_GP.py
@@ -0,0 +1,128 @@
+# Copyright (c) 2012 James Hensman
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+import pylab as pb
+from ..util.linalg import mdot, jitchol, chol_inv, pdinv
+from ..util.plot import gpplot
+from .. import kern
+from ..inference.likelihoods import likelihood
+from sparse_GP_regression import sparse_GP_regression
+
+class uncollapsed_sparse_GP(sparse_GP_regression):
+    """
+    Variational sparse GP model (Regression), where the approximating distribution q(u) is represented explicitly
+
+    :param X: inputs
+    :type X: np.ndarray (N x Q)
+    :param Y: observed data
+    :type Y: np.ndarray of observations (N x D)
+    :param q_u: canonical parameters of the distribution squashed into a 1D array
+    :type q_u: np.ndarray
+    :param kernel : the kernel/covariance function. See link kernels
+    :type kernel: a GPy kernel
+    :param Z: inducing inputs (optional, see note)
+    :type Z: np.ndarray (M x Q) | None
+    :param Zslices: slices for the inducing inputs (see slicing TODO: link)
+    :param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
+    :type M: int
+    :param beta: noise precision. TODO: ignore beta if doing EP
+    :type beta: float
+    :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
+    :type normalize_(X|Y): bool
+    """
+
+    def __init__(self, X, Y, q_u=None, *args, **kwargs):
+        sparse_GP_regression.__init__(self, X, Y, *args, **kwargs)
+        if q_u is None:
+            # default q(u): unit mean-parameter, unit precision (theta_2 = -0.5*prec)
+            q_u = np.hstack((np.ones(self.M*self.D), -0.5*np.eye(self.M).flatten()))
+        self.set_vb_param(q_u)
+
+    def _computations(self):
+        self.V = self.beta*self.Y
+        self.psi1V = np.dot(self.psi1, self.V)
+        self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T)
+        self.Lm = jitchol(self.Kmm)
+        self.Lmi = chol_inv(self.Lm)
+        self.Kmmi = np.dot(self.Lmi.T, self.Lmi)
+        self.A = mdot(self.Lmi, self.psi2, self.Lmi.T)
+        self.B = np.eye(self.M) + self.beta * self.A
+        self.Lambda = mdot(self.Lmi.T, self.B, self.Lmi)
+
+        # Compute dL_dpsi
+        self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N)
+        self.dL_dpsi1 = None # TODO
+        self.dL_dpsi2 = None # TODO
+
+        # Compute dL_dKmm
+        self.dL_dKmm = None # TODO: dB, dC and dE terms, cf. sparse_GP_regression
+
+    def log_likelihood(self):
+        """
+        Compute the (lower bound on the) log marginal likelihood
+        """
+        A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta))
+        B = -0.5*self.beta*self.D*self.trace_K
+        C = -self.D *(self.Kmm_hld +0.5*np.sum(self.Lambda * self.mmT_S) + self.M/2.)
+        E = -0.5*self.beta*self.trYYT
+        F = np.sum(np.dot(self.V.T,self.projected_mean))
+        return A+B+C+E+F
+
+    def dL_dbeta(self):
+        """
+        Compute the gradient of the log likelihood wrt beta.
+        TODO: support heteroscedastic noise
+        """
+        dA_dbeta = 0.5 * self.N*self.D/self.beta
+        dB_dbeta = - 0.5 * self.D * self.trace_K
+        dC_dbeta = None # TODO: - 0.5 * self.D * (...)
+        dD_dbeta = - 0.5 * self.trYYT
+
+        return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta + dE_dbeta)
+
+    def _raw_predict(self, Xnew, slices):
+        """Internal helper function for making predictions, does not account for normalisation"""
+
+        #TODO
+        return mu,var
+
+    def set_vb_param(self,vb_param):
+        """set the distribution q(u) from the canonical parameters"""
+        self.q_u_prec = -2.*vb_param[self.M*self.D:].reshape(self.M,self.M)
+        self.q_u_prec_L = jitchol(self.q_u_prec)
+        self.q_u_cov_L = chol_inv(self.q_u_prec_L)
+        self.q_u_cov = np.dot(self.q_u_cov_L,self.q_u_cov_L.T)
+        self.q_u_mean = np.dot(self.q_u_cov,vb_param[:self.M*self.D].reshape(self.M,self.D))
+
+        self.q_u_expectation = (self.q_u_mean, np.dot(self.q_u_mean,self.q_u_mean.T)+self.q_u_cov)
+
+        self.q_u_canonical = (np.dot(self.q_u_prec, self.q_u_mean),-0.5*self.q_u_prec)
+        #TODO: computations now?
+
+    def get_vb_param(self):
+        """
+        Return the canonical parameters of the distribution q(u)
+        """
+        return np.hstack([e.flatten() for e in self.q_u_canonical])
+
+    def vb_grad_natgrad(self):
+        """
+        Compute the gradients of the lower bound wrt the canonical and
+        expectation parameters of u.
+
+        Note that the natural gradient in either parameterisation is given by the gradient in the other (see Hensman et al. 2012, Fast Variational Inference in the Conjugate Exponential Family)
+        """
+        raise NotImplementedError # TODO
+
+    def plot(self, *args, **kwargs):
+        """
+        add the distribution q(u) to the plot from sparse_GP_regression
+        """
+        sparse_GP_regression.plot(self,*args,**kwargs)
+        #TODO: plot the q(u) dist.
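A minimal usage sketch of the new uncertain-inputs path through sparse_GP_regression
(illustrative only, not part of the patch; it assumes the chosen kernel parts provide
the psi statistics used above, and constructor defaults as in the demos):

    import numpy as np
    import GPy

    N, Q = 50, 1
    X = np.linspace(0., 10., N)[:, None]
    X_variance = 0.05 * np.ones((N, Q))      # Gaussian uncertainty on each input
    Y = np.sin(X) + np.random.randn(N, 1) * 0.1
    kernel = GPy.kern.rbf(Q, 1.0, 2.0) + GPy.kern.white(Q, 0.00001)
    m = GPy.models.sparse_GP_regression(X, Y, kernel, X_uncertainty=X_variance, M=5)
    m.constrain_positive('(variance|lengthscale|precision)')
    m.optimize()                             # uses m.preferred_optimizer ('tnc') by default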