mirror of https://github.com/SheffieldML/GPy.git
synced 2026-05-06 10:32:39 +02:00

commit 60fd1c55dc: Merge changes for model.py and optimization.py on comments.
12 changed files with 336 additions and 195 deletions
@@ -18,6 +18,7 @@ class model(parameterised):
         self.optimization_runs = []
         self.sampling_runs = []
         self.set_param(self.get_param())
+        self.preferred_optimizer = 'tnc'
     def get_param(self):
         raise NotImplementedError, "this needs to be implemented to utilise the model class"
     def set_param(self,x):

@@ -161,15 +162,18 @@ class model(parameterised):
        else:
            self.expand_param(initial_parameters)

-    def optimize(self, optimizer = 'tnc', start = None, **kwargs):
+    def optimize(self, optimizer=None, start=None, **kwargs):
        """
        Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
        kwargs are passed to the optimizer. They can be:

        :max_f_eval: maximum number of function evaluations
        :messages: whether to display during optimisation
+        :param optimizer: which optimizer to use (defaults to self.preferred_optimizer)
+        :type optimizer: string TODO: valid strings?
        """
+        if optimizer is None:
+            optimizer = self.preferred_optimizer

        def f(x):
            self.expand_param(x)
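A minimal usage sketch of the behaviour added above (my own example, not part of the commit; the model class, kernel constructors and constraint pattern are borrowed from the demos later in this diff): with the new default, optimize() falls back to the model's preferred_optimizer.

import numpy as np
import GPy

X = np.linspace(0, 1, 50)[:, None]
Y = np.sin(6 * X) + 0.05 * np.random.randn(50, 1)
kernel = GPy.kern.rbf(1, 1.0, 0.2) + GPy.kern.white(1, 0.001)
m = GPy.models.GP_regression(X, Y, kernel)
m.constrain_positive('(variance|lengthscale|precision)')
m.optimize()       # no optimizer given: falls back to m.preferred_optimizer ('tnc')
m.optimize('tnc')  # equivalent, naming the optimizer explicitly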
@@ -9,6 +9,7 @@ np.random.seed(1)
print "sparse GPLVM with RBF kernel"

N = 100
+M = 4
Q = 1
D = 2
#generate GPLVM-like data

@@ -17,7 +18,7 @@ k = GPy.kern.rbf(Q, 1.0, 2.0) + GPy.kern.white(Q, 0.00001)
K = k.K(X)
Y = np.random.multivariate_normal(np.zeros(N),K,D).T

-m = GPy.models.sparse_GPLVM(Y, Q, M = 10)
+m = GPy.models.sparse_GPLVM(Y, Q, M=M)
m.constrain_positive('(rbf|bias|noise)')
m.constrain_bounded('white', 1e-3, 0.1)
# m.plot()
@@ -12,6 +12,7 @@ import GPy
np.random.seed(2)
pb.ion()
N = 500
+M = 5

######################################
## 1 dimensional example

@@ -26,7 +27,7 @@ noise = GPy.kern.white(1)
kernel = rbf + noise

# create simple GP model
-m1 = GPy.models.sparse_GP_regression(X,Y,kernel, M = 10)
+m1 = GPy.models.sparse_GP_regression(X, Y, kernel, M=M)

# contrain all parameters to be positive
m1.constrain_positive('(variance|lengthscale|precision)')
@@ -23,8 +23,7 @@ class Optimizer():
    :rtype: optimizer object.

    """
-    def __init__(self, x_init, f_fp, f, fp , messages = False, max_f_eval = 1e4, ftol = None, gtol = None, xtol = None):
+    def __init__(self, x_init, f_fp, f, fp , messages=False, max_f_eval=1e4, ftol=None, gtol=None, xtol=None):
        self.opt_name = None
        self.f_fp = f_fp
        self.f = f
@@ -39,11 +39,13 @@ class Matern32(kernpart):
    def get_param(self):
        """return the value of the parameters."""
        return np.hstack((self.variance,self.lengthscales))

    def set_param(self,x):
        """set the value of the parameters."""
        assert x.size==(self.D+1)
        self.variance = x[0]
        self.lengthscales = x[1:]

    def get_param_names(self):
        """return parameter names."""
        if self.D==1:

@@ -56,10 +58,37 @@ class Matern32(kernpart):
        if X2 is None: X2 = X
        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
        np.add(self.variance*(1+np.sqrt(3.)*dist)*np.exp(-np.sqrt(3.)*dist), target,target)

    def Kdiag(self,X,target):
        """Compute the diagonal of the covariance matrix associated to X."""
        np.add(target,self.variance,target)

+    def dK_dtheta(self,partial,X,X2,target):
+        """derivative of the covariance matrix with respect to the parameters."""
+        if X2 is None: X2 = X
+        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
+        dvar = (1+np.sqrt(3.)*dist)*np.exp(-np.sqrt(3.)*dist)
+        invdist = 1./np.where(dist!=0.,dist,np.inf)
+        dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscales**3
+        dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
+        target[0] += np.sum(dvar*partial)
+        target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)
+
+    def dKdiag_dtheta(self,partial,X,target):
+        """derivative of the diagonal of the covariance matrix with respect to the parameters."""
+        target[0] += np.sum(partial)
+
+    def dK_dX(self,X,X2,target):
+        """derivative of the covariance matrix with respect to X."""
+        if X2 is None: X2 = X
+        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None]
+        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf)
+        dK_dX += - np.transpose(3*self.variance*dist*np.exp(-np.sqrt(3)*dist)*ddist_dX,(1,0,2))
+        target += np.sum(dK_dX*partial.T[:,:,None],0)
+
+    def dKdiag_dX(self,X,target):
+        pass

    def Gram_matrix(self,F,F1,F2,lower,upper):
        """
        Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to D=1.

@@ -87,25 +116,3 @@ class Matern32(kernpart):
        #return(G)
        return(self.lengthscales**3/(12.*np.sqrt(3)*self.variance) * G + 1./self.variance*np.dot(Flower,Flower.T) + self.lengthscales**2/(3.*self.variance)*np.dot(F1lower,F1lower.T))

-    def dK_dtheta(self,X,X2,target):
-        """derivative of the cross-covariance matrix with respect to the parameters (shape is NxMxNparam)"""
-        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
-        dvar = (1+np.sqrt(3.)*dist)*np.exp(-np.sqrt(3.)*dist)
-        invdist = 1./np.where(dist!=0.,dist,np.inf)
-        dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscales**3
-        dl = (self.variance* 3 * dist * np.exp(-np.sqrt(3.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
-        np.add(target[:,:,0],dvar, target[:,:,0])
-        np.add(target[:,:,1:],dl, target[:,:,1:])
-    def dKdiag_dtheta(self,X,target):
-        """derivative of the diagonal of the covariance matrix with respect to the parameters (shape is NxNparam)"""
-        np.add(target[:,0],1.,target[:,0])
-    def dK_dX(self,X,X2,target):
-        """derivative of the covariance matrix with respect to X (*! shape is NxMxD !*)."""
-        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None]
-        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf)
-        target += - np.transpose(3*self.variance*dist*np.exp(-np.sqrt(3)*dist)*ddist_dX,(1,0,2))
-    def dKdiag_dX(self,X,target):
-        pass
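The change running through all of these kernel files is a new gradient convention: dK_dtheta and its companions now take a partial array (dL/dK, supplied by the calling model) and accumulate one scalar per kernel parameter, instead of filling an N x M x Ntheta tensor. A small self-contained illustration of that contraction (my own sketch, not code from the diff):

import numpy as np

N, M = 4, 3
partial = np.random.randn(N, M)      # dL/dK, supplied by the calling model
dK_dvar = np.random.randn(N, M)      # dK/dvariance, one entry per element of K
dK_dlen = np.random.randn(N, M, 2)   # dK/dlengthscale for a 2-d ARD kernel
target = np.zeros(3)                 # one slot per kernel parameter

target[0] += np.sum(dK_dvar * partial)                        # chain rule: sum_ij dL/dK_ij * dK_ij/dvariance
target[1:] += (dK_dlen * partial[:, :, None]).sum(0).sum(0)   # same contraction, one entry per lengthscale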
@@ -33,33 +33,61 @@ class Matern52(kernpart):
        self.Nparam = self.D + 1
        self.name = 'Mat52'
        self.set_param(np.hstack((variance,lengthscales)))
-        self._Z, self._mu, self._S = np.empty(shape=(3,1)) # cached versions of Z,mu,S

    def get_param(self):
        """return the value of the parameters."""
        return np.hstack((self.variance,self.lengthscales))

    def set_param(self,x):
        """set the value of the parameters."""
        assert x.size==(self.D+1)
        self.variance = x[0]
        self.lengthscales = x[1:]
-        self.lengthscales2 = np.square(self.lengthscales)
-        self._Z, self._mu, self._S = np.empty(shape=(3,1)) # cached versions of Z,mu,S

    def get_param_names(self):
        """return parameter names."""
        if self.D==1:
            return ['variance','lengthscale']
        else:
            return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscales.size)]

    def K(self,X,X2,target):
        """Compute the covariance matrix between X and X2."""
        if X2 is None: X2 = X
        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
        np.add(self.variance*(1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist), target,target)

    def Kdiag(self,X,target):
        """Compute the diagonal of the covariance matrix associated to X."""
        np.add(target,self.variance,target)

+    def dK_dtheta(self,partial,X,X2,target):
+        """derivative of the covariance matrix with respect to the parameters."""
+        if X2 is None: X2 = X
+        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
+        invdist = 1./np.where(dist!=0.,dist,np.inf)
+        dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscales**3
+        dvar = (1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist)
+        dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
+        target[0] += np.sum(dvar*partial)
+        target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)
+
+    def dKdiag_dtheta(self,X,target):
+        """derivative of the diagonal of the covariance matrix with respect to the parameters."""
+        target[0] += np.sum(partial)
+
+    def dK_dX(self,X,X2,target):
+        """derivative of the covariance matrix with respect to X."""
+        if X2 is None: X2 = X
+        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None]
+        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf)
+        dK_dX += - np.transpose(self.variance*5./3*dist*(1+np.sqrt(5)*dist)*np.exp(-np.sqrt(5)*dist)*ddist_dX,(1,0,2))
+        target += np.sum(dK_dX*partial.T[:,:,None],0)
+
+    def dKdiag_dX(self,X,target):
+        pass

    def Gram_matrix(self,F,F1,F2,F3,lower,upper):
        """
        Return the Gram matrix of the vector of functions F with respect to the RKHS norm. The use of this function is limited to D=1.

@@ -91,26 +119,5 @@ class Matern52(kernpart):
        orig2 = 3./5*self.lengthscales**2 * ( np.dot(F1lower,F1lower.T) + 1./8*np.dot(Flower,F2lower.T) + 1./8*np.dot(F2lower,Flower.T))
        return(1./self.variance* (G_coef*G + orig + orig2))

-    def dK_dtheta(self,X,X2,target):
-        """derivative of the cross-covariance matrix with respect to the parameters (shape is NxMxNparam)"""
-        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
-        invdist = 1./np.where(dist!=0.,dist,np.inf)
-        dist2M = np.square(X[:,None,:]-X2[None,:,:])/self.lengthscales**3
-        dvar = (1+np.sqrt(5.)*dist+5./3*dist**2)*np.exp(-np.sqrt(5.)*dist)
-        dl = (self.variance * 5./3 * dist * (1 + np.sqrt(5.)*dist ) * np.exp(-np.sqrt(5.)*dist))[:,:,np.newaxis] * dist2M*invdist[:,:,np.newaxis]
-        np.add(target[:,:,0],dvar, target[:,:,0])
-        np.add(target[:,:,1:],dl, target[:,:,1:])
-    def dKdiag_dtheta(self,X,target):
-        """derivative of the diagonal of the covariance matrix with respect to the parameters (shape is NxNparam)"""
-        np.add(target[:,0],1.,target[:,0])
-    def dK_dX(self,X,X2,target):
-        """derivative of the covariance matrix with respect to X (*! shape is NxMxD !*)."""
-        if X2 is None: X2 = X
-        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None]
-        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf)
-        target += - np.transpose(self.variance*5./3*dist*(1+np.sqrt(5)*dist)*np.exp(-np.sqrt(5)*dist)*ddist_dX,(1,0,2))
-    def dKdiag_dX(self,X,target):
-        pass
@@ -62,7 +62,7 @@ class exponential(kernpart):
        np.add(target,self.variance,target)

    def dK_dtheta(self,partial,X,X2,target):
-        """derivative of the cross-covariance matrix with respect to the parameters (shape is NxMxNparam)"""
+        """derivative of the covariance matrix with respect to the parameters."""
        if X2 is None: X2 = X
        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))
        invdist = 1./np.where(dist!=0.,dist,np.inf)

@@ -73,12 +73,12 @@ class exponential(kernpart):
        target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)

    def dKdiag_dtheta(self,partial,X,target):
-        """derivative of the diagonal of the covariance matrix with respect to the parameters (shape is NxNparam)"""
+        """derivative of the diagonal of the covariance matrix with respect to the parameters."""
        #NB: derivative of diagonal elements wrt lengthscale is 0
        target[0] += np.sum(partial)

    def dK_dX(self,X,X2,target):
-        """derivative of the covariance matrix with respect to X (*! shape is NxMxD !*)."""
+        """derivative of the covariance matrix with respect to X."""
        if X2 is None: X2 = X
        dist = np.sqrt(np.sum(np.square((X[:,None,:]-X2[None,:,:])/self.lengthscales),-1))[:,:,None]
        ddist_dX = (X[:,None,:]-X2[None,:,:])/self.lengthscales**2/np.where(dist!=0.,dist,np.inf)
@@ -21,6 +21,7 @@ class linear(kernpart):
        self.Nparam = 1
        self.name = 'linear'
        self.set_param(variance)
+        self._Xcache, self._X2cache = np.empty(shape=(2,))

    def get_param(self):
        return self.variance

@@ -32,7 +33,8 @@ class linear(kernpart):
        return ['variance']

    def K(self,X,X2,target):
-        target += self.variance * np.dot(X, X2.T)
+        self._K_computations(X, X2)
+        target += self.variance * self._dot_product

    def Kdiag(self,X,target):
        np.add(target,np.sum(self.variance*np.square(X),-1),target)

@@ -42,7 +44,9 @@ class linear(kernpart):
        Computes the derivatives wrt theta
        Return shape is NxMx(Ntheta)
        """
-        product = np.dot(X, X2.T)
+        self._K_computations(X, X2)
+        product = self._dot_product
+        # product = np.dot(X, X2.T)
        target += np.sum(product*partial)

    def dK_dX(self,partial,X,X2,target):

@@ -51,6 +55,20 @@ class linear(kernpart):
    def dKdiag_dtheta(self,partial,X,target):
        target += np.sum(partial*np.square(X).sum(1))

+    def _K_computations(self,X,X2):
+        # (Nicolo) changed the logic here. If X2 is None, we want to cache
+        # (X,X). In practice X2 should always be passed.
+        if X2 is None:
+            X2 = X
+        if not (np.all(X==self._Xcache) and np.all(X2==self._X2cache)):
+            self._Xcache = X
+            self._X2cache = X2
+            self._dot_product = np.dot(X,X2.T)
+        else:
+            # print "Cache hit!"
+            pass # TODO: insert debug message here (logging framework)

    # def psi0(self,Z,mu,S,target):
    #     expected = np.square(mu) + S
    #     np.add(target,np.sum(self.variance*expected),target)
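The _K_computations helper added above memoizes np.dot(X, X2.T) between calls. A standalone sketch of the same caching pattern (an illustration under assumed names, not code from the repository), written so the comparison also copes with inputs whose shapes change:

import numpy as np

class DotCache(object):
    """Recompute np.dot(X, X2.T) only when the inputs actually change."""
    def __init__(self):
        self._Xcache = np.empty((0,))
        self._X2cache = np.empty((0,))
        self._dot_product = None

    def dot(self, X, X2=None):
        if X2 is None:
            X2 = X
        changed = (X.shape != self._Xcache.shape or X2.shape != self._X2cache.shape
                   or not (np.all(X == self._Xcache) and np.all(X2 == self._X2cache)))
        if changed:
            # cache copies of the inputs and the product for the next call
            self._Xcache, self._X2cache = X.copy(), X2.copy()
            self._dot_product = np.dot(X, X2.T)
        return self._dot_product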
@@ -30,6 +30,7 @@ class rbf_ARD(kernpart):

    def get_param(self):
        return np.hstack((self.variance,self.lengthscales))

    def set_param(self,x):
        assert x.size==(self.D+1)
        self.variance = x[0]

@@ -37,61 +38,73 @@ class rbf_ARD(kernpart):
        self.lengthscales2 = np.square(self.lengthscales)
        #reset cached results
        self._Z, self._mu, self._S = np.empty(shape=(3,1)) # cached versions of Z,mu,S

    def get_param_names(self):
        if self.D==1:
            return ['variance','lengthscale']
        else:
            return ['variance']+['lengthscale_%i'%i for i in range(self.lengthscales.size)]

    def K(self,X,X2,target):
        self._K_computations(X,X2)
        np.add(self.variance*self._K_dvar, target,target)

    def Kdiag(self,X,target):
        np.add(target,self.variance,target)

-    def dK_dtheta(self,X,X2,target):
-        """Return shape is NxMx(Ntheta)"""
+    def dK_dtheta(self,partial,X,X2,target):
        self._K_computations(X,X2)
        dl = self._K_dvar[:,:,None]*self.variance*self._K_dist2/self.lengthscales
-        np.add(target[:,:,0],self._K_dvar, target[:,:,0])
-        np.add(target[:,:,1:],dl, target[:,:,1:])
+        target[0] += np.sum(self._K_dvar*partial)
+        target[1:] += (dl*partial[:,:,None]).sum(0).sum(0)

    def dKdiag_dtheta(self,X,target):
-        np.add(target[:,0],1.,target[:,0])
+        target[0] += np.sum(partial)

    def dK_dX(self,X,X2,target):
        self._K_computations(X,X2)
        dZ = self.variance*self._K_dvar[:,:,None]*self._K_dist/self.lengthscales2
-        np.add(target,-dZ.transpose(1,0,2),target)
+        dK_dX = -dZ.transpose(1,0,2)
+        target += np.sum(dK_dX*partial.T[:,:,None],0)

+    def dKdiag_dX(self,partial,X,target):
+        pass

    def psi0(self,Z,mu,S,target):
-        np.add(target, self.variance, target)
+        target += self.variance

-    def dpsi0_dtheta(self,Z,mu,S,target):
-        target[:,0] += 1.
+    def dpsi0_dtheta(self,partial,Z,mu,S,target):
+        target[0] += 1.

    def dpsi0_dmuS(self,Z,mu,S,target_mu,target_S):
        pass

    def psi1(self,Z,mu,S,target):
-        """Think N,M,Q """
        self._psi_computations(Z,mu,S)
        np.add(target, self._psi1,target)

-    def dpsi1_dtheta(self,Z,mu,S,target):
-        """N,Q,(Ntheta)"""
+    def dpsi1_dtheta(self,partial,Z,mu,S,target):
        self._psi_computations(Z,mu,S)
        denom_deriv = S[:,None,:]/(self.lengthscales**3+self.lengthscales*S[:,None,:])
        d_length = self._psi1[:,:,None]*(self.lengthscales*np.square(self._psi1_dist/(self.lengthscales2+S[:,None,:])) + denom_deriv)
-        target[:,:,0] += self._psi1/self.variance
-        target[:,:,1:] += d_length
+        target[0] += np.sum(partial*self._psi1/self.variance)
+        target[1:] += (d_length*partial[:,:,None]).sum(0).sum(0)

-    def dpsi1_dZ(self,Z,mu,S,target):
+    def dpsi1_dZ(self,partial,Z,mu,S,target):
        self._psi_computations(Z,mu,S)
        np.add(target,-self._psi1[:,:,None]*self._psi1_dist/self.lengthscales2/self._psi1_denom,target)
+        target += np.sum(partial[:,:,None]*-self._psi1[:,:,None]*self._psi1_dist/self.lengthscales2/self._psi1_denom,0)

-    def dpsi1_dmuS(self,Z,mu,S,target_mu,target_S):
+    def dpsi1_dmuS(self,partial,Z,mu,S,target_mu,target_S):
        """return shapes are N,M,Q"""
        self._psi_computations(Z,mu,S)
        tmp = self._psi1[:,:,None]/self.lengthscales2/self._psi1_denom
-        np.add(target_mu,tmp*self._psi1_dist,target_mu)
-        np.add(target_S, 0.5*tmp*(self._psi1_dist_sq-1), target_S)
+        target_mu += np.sum(partial*tmp*self._psi1_dist,1)
+        target_S += np.sum(partial*0.5*tmp*(self._psi1_dist_sq-1),1)

    def psi2(self,Z,mu,S,target):
-        """shape N,M,M"""
        self._psi_computations(Z,mu,S)
-        np.add(target, self._psi2,target)
+        target += self._psi2.sum(0) #TODO: psi2 should be NxMxM (for het. noise)

    def dpsi2_dtheta(self,Z,mu,S,target):
        """Shape N,M,M,Ntheta"""

@@ -99,21 +112,21 @@ class rbf_ARD(kernpart):
        d_var = np.sum(2.*self._psi2/self.variance,0)
        d_length = self._psi2[:,:,:,None]*(0.5*self._psi2_Zdist_sq*self._psi2_denom + 2.*self._psi2_mudist_sq + 2.*S[:,None,None,:]/self.lengthscales2)/(self.lengthscales*self._psi2_denom)
        d_length = d_length.sum(0)
-        target[:,:,0] += d_var
-        target[:,:,1:] += d_length
+        target[0] += np.sum(partial*d_var)
+        target[1:] += (d_length*partial[:,:,None]).sum(0).sum(0)

    def dpsi2_dZ(self,Z,mu,S,target):
        """Returns shape N,M,M,Q"""
        self._psi_computations(Z,mu,S)
        dZ = self._psi2[:,:,:,None]/self.lengthscales2*(-0.5*self._psi2_Zdist + self._psi2_mudist/self._psi2_denom)
-        target += dZ
+        target += np.sum(partial[None,:,:,None]*dZ,0).sum(1)

    def dpsi2_dmuS(self,Z,mu,S,target_mu,target_S):
        """Think N,M,M,Q """
        self._psi_computations(Z,mu,S)
        tmp = self._psi2[:,:,:,None]/self.lengthscales2/self._psi2_denom
-        np.add(target_mu, -tmp*(2.*self._psi2_mudist),target_mu) #N,M,M,Q
-        np.add(target_S, tmp*(2.*self._psi2_mudist_sq-1), target_S) #N,M,M,Q
+        target_mu += (partial*-tmp*2.*self._psi2_mudist).sum(1).sum(1)
+        target_S += (partial*tmp*(2.*self._psi2_mudist_sq-1)).sum(1).sum(1)

    def _K_computations(self,X,X2):
        if not (np.all(X==self._X) and np.all(X2==self._X2)):
@@ -10,6 +10,10 @@ from .. import kern
from ..inference.likelihoods import likelihood
from GP_regression import GP_regression

+#Still TODO:
+# make use of slices properly (kernel can now do this)
+# enable heteroscedastic noise (kernel will need to compute psi2 as a (NxMxM) array)

class sparse_GP_regression(GP_regression):
    """
    Variational sparse GP model (Regression)

@@ -22,6 +26,8 @@ class sparse_GP_regression(GP_regression):
    :type kernel: a GPy kernel
    :param Z: inducing inputs (optional, see note)
    :type Z: np.ndarray (M x Q) | None
+    :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance)
+    :type X_uncertainty: np.ndarray (N x Q) | None
    :param Zslices: slices for the inducing inputs (see slicing TODO: link)
    :param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
    :type M: int

@@ -31,7 +37,7 @@ class sparse_GP_regression(GP_regression):
    :type normalize_(X|Y): bool
    """

-    def __init__(self,X,Y,kernel=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False):
+    def __init__(self,X,Y,kernel=None, X_uncertainty=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False):
        self.beta = beta
        if Z is None:
            self.Z = np.random.permutation(X.copy())[:M]

@@ -40,10 +46,20 @@ class sparse_GP_regression(GP_regression):
            assert Z.shape[1]==X.shape[1]
            self.Z = Z
            self.M = Z.shape[1]
+        if X_uncertainty is None:
+            self.has_uncertain_inputs=False
+        else:
+            assert X_uncertainty.shape==X.shape
+            self.has_uncertain_inputs=False
+            self.X_uncertainty = X_uncertainty

-        GP_regression.__init__(self, X, Y, kernel = kernel, normalize_X = normalize_X, normalize_Y = normalize_Y)
+        GP_regression.__init__(self, X, Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y)
        self.trYYT = np.sum(np.square(self.Y))

+        #normalise X uncertainty also
+        if self.has_uncertain_inputs:
+            self.X_uncertainty /= np.square(self._Xstd)

    def set_param(self, p):
        self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
        self.beta = p[self.M*self.Q]

@@ -54,18 +70,23 @@ class sparse_GP_regression(GP_regression):

    def _compute_kernel_matrices(self):
        # kernel computations, using BGPLVM notation
-        #TODO: the following can be switched out in the case of uncertain inputs (or the BGPLVM!)
        #TODO: slices for psi statistics (easy enough)

        self.Kmm = self.kern.K(self.Z)
-        self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum()
-        self.psi1 = self.kern.K(self.Z,self.X)
-        self.psi2 = np.dot(self.psi1,self.psi1.T)
+        if self.has_uncertain_inputs:
+            self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum()
+            self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T
+            self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty)
+        else:
+            self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum()
+            self.psi1 = self.kern.K(self.Z,self.X)
+            self.psi2 = np.dot(self.psi1,self.psi1.T)

    def _computations(self):
        # TODO find routine to multiply triangular matrices
-        self.psi1Y = np.dot(self.psi1, self.Y)
-        self.psi1YYpsi1 = np.dot(self.psi1Y, self.psi1Y.T)
+        self.V = self.beta*self.Y
+        self.psi1V = np.dot(self.psi1, self.V)
+        self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T)
        self.Lm = jitchol(self.Kmm)
        self.Lmi = chol_inv(self.Lm)
        self.Kmmi = np.dot(self.Lmi.T, self.Lmi)

@@ -77,25 +98,19 @@ class sparse_GP_regression(GP_regression):
        self.LLambdai = np.dot(self.LBi, self.Lmi)
        self.trace_K = self.psi0 - np.trace(self.A)
        self.LBL_inv = mdot(self.Lmi.T, self.Bi, self.Lmi)
-        self.C = mdot(self.LLambdai, self.psi1Y)
-        self.G = mdot(self.LBL_inv, self.psi1YYpsi1, self.LBL_inv.T)
+        self.C = mdot(self.LLambdai, self.psi1V)
+        self.G = mdot(self.LBL_inv, self.psi1VVpsi1, self.LBL_inv.T)

-        # Computes dL_dpsi
+        # Compute dL_dpsi
        self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N)
-        dC_dpsi1 = (self.LLambdai.T[:,:, None, None] * self.Y) # this is sane.
-        tmp = (dC_dpsi1*self.C[None,:,None,:]).sum(1).sum(-1)
-        self.dL_dpsi1 = self.beta2 * tmp
-        self.dL_dpsi2 = (- 0.5 * self.D * self.beta * (self.LBL_inv - self.Kmmi)
-            - self.beta**3 * 0.5 * self.G)
+        dC_dpsi1 = (self.LLambdai.T[:,:, None, None] * self.V)
+        self.dL_dpsi1 = (dC_dpsi1*self.C[None,:,None,:]).sum(1).sum(-1)
+        self.dL_dpsi2 = - 0.5 * self.beta * (self.D*(self.LBL_inv - self.Kmmi) + self.G)

-        # Computes dL_dKmm TODO: nicer precomputations
-        tmp = self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi)
-        self.dL_dKmm = -self.beta * self.D * 0.5 * mdot(self.Lmi.T, self.A, self.Lmi) # dB
-        self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - tmp - tmp.T + self.Kmmi) # dC
-        tmp = (mdot(self.LBL_inv, self.psi1YYpsi1, self.Kmmi)
-            - self.beta*mdot(self.G, self.psi2, self.Kmmi))
-        self.dL_dKmm += -0.5*self.beta2*(tmp + tmp.T - self.G) # dE
+        # Compute dL_dKmm
+        self.dL_dKmm = -0.5 * self.beta * self.D * mdot(self.Lmi.T, self.A, self.Lmi) # dB
+        self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - 2.*self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi) + self.Kmmi) # dC
+        self.dL_dKmm += np.dot(np.dot(self.G,self.beta*self.psi2) - np.dot(self.LBL_inv, self.psi1VVpsi1), self.Kmmi) + 0.5*self.G # dE

    def get_param(self):
        return np.hstack([self.Z.flatten(),self.beta,self.kern.extract_param()])

@@ -104,65 +119,83 @@ class sparse_GP_regression(GP_regression):
        return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + ['noise_precision']+self.kern.extract_param_names()

    def log_likelihood(self):
+        """
+        Compute the (lower bound on the) log marginal likelihood
+        """
        A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta))
        B = -0.5*self.beta*self.D*self.trace_K
        C = -self.D * np.sum(np.log(np.diag(self.LB)))
        D = -0.5*self.beta*self.trYYT
-        E = +0.5*self.beta2*np.sum(self.psi1YYpsi1 * self.LBL_inv)
+        E = +0.5*np.sum(self.psi1VVpsi1 * self.LBL_inv)

        return A+B+C+D+E

    def dL_dbeta(self):
-        """ compute the gradient of the log likelihood wrt beta.
-        TODO: suport heteroscedatic noise"""
+        """
+        Compute the gradient of the log likelihood wrt beta.
+        TODO: support heteroscedastic noise
+        """

        dA_dbeta = 0.5 * self.N*self.D/self.beta
        dB_dbeta = - 0.5 * self.D * self.trace_K
        dC_dbeta = - 0.5 * self.D * np.sum(self.Bi*self.A)
        dD_dbeta = - 0.5 * self.trYYT
-        tmp = mdot(self.LBi.T, self.LLambdai, self.psi1Y)
-        dE_dbeta = (self.beta * np.sum(np.square(self.C)) - 0.5 * self.beta2
-            * np.sum(self.A * np.dot(tmp, tmp.T)))
+        tmp = mdot(self.LBi.T, self.LLambdai, self.psi1V)
+        dE_dbeta = np.sum(np.square(self.C))/self.beta - 0.5 * np.sum(self.A * np.dot(tmp, tmp.T))

        return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta + dE_dbeta)

    def dL_dtheta(self):
-        #re-cast computations in psi2 back to psi1:
-        dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1)
+        """
+        Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel
+        """
        dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z)
-        dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X)
-        dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X)
+        if self.has_uncertain_inputs:
+            dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty)
+            dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
+            dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # for multiple_beta, dL_dpsi2 will be a different shape
+        else:
+            #re-cast computations in psi2 back to psi1:
+            dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1)
+            dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X)
+            dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X)

        return dL_dtheta

    def dL_dZ(self):
-        #re-cast computations in psi2 back to psi1:
-        dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1)
+        """
+        The derivative of the bound wrt the inducing inputs Z
+        """
        dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z,)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ
-        dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X)
+        if self.has_uncertain_inputs:
+            dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
+            dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty)
+        else:
+            #re-cast computations in psi2 back to psi1:
+            dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1)
+            dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X)
        return dL_dZ

    def log_likelihood_gradients(self):
        return np.hstack([self.dL_dZ().flatten(), self.dL_dbeta(), self.dL_dtheta()])

-    def _raw_predict(self,Xnew,slices):
+    def _raw_predict(self, Xnew, slices):
        """Internal helper function for making predictions, does not account for normalisation"""

        Kx = self.kern.K(self.Z, Xnew)
        Kxx = self.kern.K(Xnew)

-        mu = self.beta * mdot(Kx.T, self.LBL_inv, self.psi1Y)
+        mu = mdot(Kx.T, self.LBL_inv, self.psi1V)
        var = Kxx - mdot(Kx.T, (self.Kmmi - self.LBL_inv), Kx) + np.eye(Xnew.shape[0])/self.beta # TODO: This beta doesn't belong here in the EP case.
        return mu,var

-    def plot(self,*args,**kwargs):
+    def plot(self, *args, **kwargs):
        """
        Plot the fitted model: just call the GP_regression plot function and then add inducing inputs
        """
        GP_regression.plot(self,*args,**kwargs)
        if self.Q==1:
            pb.plot(self.Z,self.Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12)
+            if self.has_uncertain_inputs:
+                pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten()))
        if self.Q==2:
            pb.plot(self.Z[:,0],self.Z[:,1],'wo')
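Taken together, these changes fold the old uncertain_input_GP_regression model (deleted below) into sparse_GP_regression through the new X_uncertainty argument: when it is given, the kernel's psi0/psi1/psi2 expectations replace Kdiag and K in the bound. A hypothetical usage sketch follows (my own example, reusing constructs from the demos in this diff; the 0.01 input variance is invented). Note that the constructor hunk above sets has_uncertain_inputs to False in the else branch where True appears intended, so the sketch assumes the flag behaves as named.

import numpy as np
import GPy

N, Q = 200, 1
X = np.random.rand(N, Q)
k = GPy.kern.rbf(Q, 1.0, 0.2) + GPy.kern.white(Q, 0.001)
Y = np.random.multivariate_normal(np.zeros(N), k.K(X), 1).T
X_var = 0.01 * np.ones((N, Q))  # Gaussian variance on each input (invented value)

m = GPy.models.sparse_GP_regression(X, Y, k, X_uncertainty=X_var, M=10)
m.constrain_positive('(variance|lengthscale|precision)')
m.optimize()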
@@ -1,70 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-import numpy as np
-import pylab as pb
-from ..util.linalg import mdot, jitchol, chol_inv, pdinv
-from ..util.plot import gpplot
-from .. import kern
-from ..inference.likelihoods import likelihood
-from sparse_GP_regression import sparse_GP_regression
-
-class uncertain_input_GP_regression(sparse_GP_regression):
-    """
-    Variational sparse GP model (Regression) with uncertainty on the inputs
-
-    :param X: inputs
-    :type X: np.ndarray (N x Q)
-    :param X_uncertainty: uncertainty on X (Gaussian variances, assumed isotrpic)
-    :type X_uncertainty: np.ndarray (N x Q)
-    :param Y: observed data
-    :type Y: np.ndarray of observations (N x D)
-    :param kernel : the kernel/covariance function. See link kernels
-    :type kernel: a GPy kernel
-    :param Z: inducing inputs (optional, see note)
-    :type Z: np.ndarray (M x Q) | None
-    :param Zslices: slices for the inducing inputs (see slicing TODO: link)
-    :param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
-    :type M: int
-    :param beta: noise precision. TODO> ignore beta if doing EP
-    :type beta: float
-    :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
-    :type normalize_(X|Y): bool
-    """
-
-    def __init__(self,X,Y,X_uncertainty,kernel=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False):
-        self.X_uncertainty = X_uncertainty
-        sparse_GP_regression.__init__(self, X, Y, kernel = kernel, beta = beta, normalize_X = normalize_X, normalize_Y = normalize_Y)
-        self.trYYT = np.sum(np.square(self.Y))
-
-    def _compute_kernel_matrices(self):
-        # kernel computations, using BGPLVM notation
-        #TODO: slices for psi statistics (easy enough)
-        self.Kmm = self.kern.K(self.Z)
-        self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum()
-        self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T
-        self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty)
-
-    def dL_dtheta(self):
-        #re-cast computations in psi2 back to psi1:
-        dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z)
-        dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty)
-        dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
-        dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # for multiple_beta, dL_dpsi2 will be a different shape
-        return dL_dtheta
-
-    def dL_dZ(self):
-        dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z,)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ
-        dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
-        dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty)
-        return dL_dZ
-
-    def plot(self,*args,**kwargs):
-        """
-        Plot the fitted model: just call the sparse GP_regression plot function and then add
-        markers to represent uncertainty on the inputs
-        """
-        sparse_GP_regression.plot(self,*args,**kwargs)
-        if self.Q==1:
-            pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten()))
GPy/models/uncollapsed_sparse_GP.py (new file, 128 lines)

@@ -0,0 +1,128 @@
+# Copyright (c) 2012 James Hensman
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+import pylab as pb
+from ..util.linalg import mdot, jitchol, chol_inv, pdinv
+from ..util.plot import gpplot
+from .. import kern
+from ..inference.likelihoods import likelihood
+from sparse_GP_regression import sparse_GP_regression
+
+class uncollapsed_sparse_GP(sparse_GP_regression):
+    """
+    Variational sparse GP model (Regression), where the approximating distribution q(u) is represented explicitly
+
+    :param X: inputs
+    :type X: np.ndarray (N x Q)
+    :param Y: observed data
+    :type Y: np.ndarray of observations (N x D)
+    :param q_u: canonical parameters of the distribution squashed into a 1D array
+    :type q_u: np.ndarray
+    :param kernel : the kernel/covariance function. See link kernels
+    :type kernel: a GPy kernel
+    :param Z: inducing inputs (optional, see note)
+    :type Z: np.ndarray (M x Q) | None
+    :param Zslices: slices for the inducing inputs (see slicing TODO: link)
+    :param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
+    :type M: int
+    :param beta: noise precision. TODO> ignore beta if doing EP
+    :type beta: float
+    :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
+    :type normalize_(X|Y): bool
+    """
+
+    def __init__(self, X, Y, q_u=None, *args, **kwargs)
+        D = Y.shape[1]
+        if q_u is None:
+            if Z is None:
+                M = Z.shape[0]
+            else:
+                M=M
+            self.set_vb_param(np.hstack((np.ones(M*D)),np.eye(M).flatten()))
+        sparse_GP_regression.__init__(self, X, Y, *args, **kwargs)
+
+    def _computations(self):
+        self.V = self.beta*self.Y
+        self.psi1V = np.dot(self.psi1, self.V)
+        self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T)
+        self.Lm = jitchol(self.Kmm)
+        self.Lmi = chol_inv(self.Lm)
+        self.Kmmi = np.dot(self.Lmi.T, self.Lmi)
+        self.A = mdot(self.Lmi, self.psi2, self.Lmi.T)
+        self.B = np.eye(self.M) + self.beta * self.A
+        self.Lambda = mdot(self.Lmi.T,self.B,sel.Lmi)
+
+        # Compute dL_dpsi
+        self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N)
+        self.dL_dpsi1 =
+        self.dL_dpsi2 =
+
+        # Compute dL_dKmm
+        self.dL_dKmm =
+        self.dL_dKmm +=
+        self.dL_dKmm +=
+
+    def log_likelihood(self):
+        """
+        Compute the (lower bound on the) log marginal likelihood
+        """
+        A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta))
+        B = -0.5*self.beta*self.D*self.trace_K
+        C = -self.D *(self.Kmm_hld +0.5*np.sum(self.Lambda * self.mmT_S) + self.M/2.)
+        E = -0.5*self.beta*self.trYYT
+        F = np.sum(np.dot(self.V.T,self.projected_mean))
+        return A+B+C+D+E+F
+
+    def dL_dbeta(self):
+        """
+        Compute the gradient of the log likelihood wrt beta.
+        TODO: support heteroscedastic noise
+        """
+        dA_dbeta = 0.5 * self.N*self.D/self.beta
+        dB_dbeta = - 0.5 * self.D * self.trace_K
+        dC_dbeta = - 0.5 * self.D * #TODO
+        dD_dbeta = - 0.5 * self.trYYT
+
+        return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta + dE_dbeta)
+
+    def _raw_predict(self, Xnew, slices):
+        """Internal helper function for making predictions, does not account for normalisation"""
+
+        #TODO
+        return mu,var
+
+    def set_vb_param(self,vb_param):
+        """set the distribution q(u) from the canonical parameters"""
+        self.q_u_prec = -2.*vb_param[self.M*self.D:].reshape(self.M,self.M)
+        self.q_u_prec_L = jitchol(self.q_u_prec)
+        self.q_u_cov_L = chol_inv(self.q_u_prec_L)
+        self.q_u_cov = np.dot(self.q_u_cov_L,self.q_u_cov_L.T)
+        self.q_u_mean = -2.*np.dot(self.q_u_cov,vb_param[:self.M*self.D].reshape(self.M,self.D))
+
+        self.q_u_expectation = (self.q_u_mean, np.dot(self.q_u_mean,self.q_u_mean.T)+self.q_u_cov)
+
+        self.q_u_canonical = (np.dot(self.q_u_prec, self.q_u_mean),-0.5*self.q_u_prec)
+        #TODO: computations now?
+
+    def get_vb_param(self):
+        """
+        Return the canonical parameters of the distribution q(u)
+        """
+        return np.hstack([e.flatten() for e in self.q_u_canonical])
+
+    def vb_grad_natgrad(self):
+        """
+        Compute the gradients of the lower bound wrt the canonical and
+        Expectation parameters of u.
+
+        Note that the natural gradient in either is given by the gradient in the other (See Hensman et al 2012 Fast Variational inference in the conjugate exponential Family)
+        """
+        foobar #TODO
+
+    def plot(self, *args, **kwargs):
+        """
+        add the distribution q(u) to the plot from sparse_GP_regression
+        """
+        sparse_GP_regression.plot(self,*args,**kwargs)
+        #TODO: plot the q(u) dist.