Merge remote-tracking branch 'Falkor/newGP' into newGP

This commit is contained in:
Ricardo Andrade 2013-01-28 10:46:33 +00:00
commit 058cee5895
11 changed files with 905 additions and 251 deletions

View file

@ -76,11 +76,10 @@ def toy_linear_1d_classification(model_type='Full', inducing=4, seed=default_see
# create simple GP model
if model_type=='Full':
m = GPy.models.simple_GP_EP(data['X'],likelihood)
m = GPy.models.GP_EP(data['X'],likelihood)
else:
# create sparse GP EP model
m = GPy.models.sparse_GP_EP(data['X'],likelihood=likelihood,inducing=inducing,ep_proxy=model_type)
m.constrain_positive('var')
m.constrain_positive('len')

43
GPy/examples/ep_fix.py Normal file
View file

@ -0,0 +1,43 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
"""
Simple Gaussian Processes classification
"""
import pylab as pb
import numpy as np
import GPy
pb.ion()
pb.close('all')
default_seed=10000
model_type='Full'
inducing=4
seed=default_seed
"""Simple 1D classification example.
:param model_type: type of model to fit ['Full', 'FITC', 'DTC'].
:param seed : seed value for data generation (default is 4).
:type seed: int
:param inducing : number of inducing variables (only used for 'FITC' or 'DTC').
:type inducing: int
"""
data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
likelihood = GPy.inference.likelihoods.probit(data['Y'][:, 0:1])
m = GPy.models.GP(data['X'],likelihood=likelihood)
#m = GPy.models.GP(data['X'],Y=likelihood.Y)
m.constrain_positive('var')
m.constrain_positive('len')
m.tie_param('lengthscale')
if not isinstance(m.likelihood,GPy.inference.likelihoods.gaussian):
m.approximate_likelihood()
print m.checkgrad()
# Optimize and plot
m.optimize()
#m.em(plot_all=False) # EM algorithm
m.plot()
print(m)

50
GPy/examples/poisson.py Normal file
View file

@ -0,0 +1,50 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
"""
Simple Gaussian Processes classification
"""
import pylab as pb
import numpy as np
import GPy
pb.ion()
pb.close('all')
default_seed=10000
model_type='Full'
inducing=4
seed=default_seed
"""Simple 1D classification example.
:param model_type: type of model to fit ['Full', 'FITC', 'DTC'].
:param seed : seed value for data generation (default is 4).
:type seed: int
:param inducing : number of inducing variables (only used for 'FITC' or 'DTC').
:type inducing: int
"""
X = np.arange(0,100,5)[:,None]
F = np.round(np.sin(X/18.) + .1*X)
E = np.random.randint(-3,3,20)[:,None]
Y = F + E
pb.plot(X,F,'k-')
pb.plot(X,Y,'ro')
pb.figure()
likelihood = GPy.inference.likelihoods.poisson(Y,scale=4.)
m = GPy.models.GP(X,likelihood=likelihood)
#m = GPy.models.GP(data['X'],Y=likelihood.Y)
m.constrain_positive('var')
m.constrain_positive('len')
m.tie_param('lengthscale')
if not isinstance(m.likelihood,GPy.inference.likelihoods.gaussian):
m.approximate_likelihood()
print m.checkgrad()
# Optimize and plot
m.optimize()
#m.em(plot_all=False) # EM algorithm
m.plot()
print(m)

View file

@ -0,0 +1,76 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
"""
Sparse Gaussian Processes regression with an RBF kernel
"""
import pylab as pb
import numpy as np
import GPy
np.random.seed(2)
pb.ion()
N = 500
M = 5
######################################
## 1 dimensional example
# sample inputs and outputs
X = np.random.uniform(-3.,3.,(N,1))
#Y = np.sin(X)+np.random.randn(N,1)*0.05
F = np.sin(X)+np.random.randn(N,1)*0.05
Y = np.ones([F.shape[0],1])
Y[F<0] = -1
likelihood = GPy.inference.likelihoods.probit(Y)
# construct kernel
rbf = GPy.kern.rbf(1)
noise = GPy.kern.white(1)
kernel = rbf + noise
# create simple GP model
#m1 = GPy.models.sparse_GP_regression(X, Y, kernel, M=M)
m1 = GPy.models.sparse_GP(X, kernel, M=M,likelihood= likelihood)
# contrain all parameters to be positive
m1.constrain_positive('(variance|lengthscale|precision)')
#m1.constrain_positive('(variance|lengthscale)')
#m1.constrain_fixed('prec',10.)
#check gradient FIXME unit test please
m1.checkgrad()
# optimize and plot
m1.optimize('tnc', messages = 1)
m1.plot()
# print(m1)
######################################
## 2 dimensional example
# # sample inputs and outputs
# X = np.random.uniform(-3.,3.,(N,2))
# Y = np.sin(X[:,0:1]) * np.sin(X[:,1:2])+np.random.randn(N,1)*0.05
# # construct kernel
# rbf = GPy.kern.rbf(2)
# noise = GPy.kern.white(2)
# kernel = rbf + noise
# # create simple GP model
# m2 = GPy.models.sparse_GP_regression(X,Y,kernel, M = 50)
# create simple GP model
# # contrain all parameters to be positive (but not inducing inputs)
# m2.constrain_positive('(variance|lengthscale|precision)')
# #check gradient FIXME unit test please
# m2.checkgrad()
# # optimize and plot
# pb.figure()
# m2.optimize('tnc', messages = 1)
# m2.plot()
# print(m2)

View file

@ -1,9 +1,6 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import random
import pylab as pb #TODO erase me
from scipy import stats, linalg
from .likelihoods import likelihood
from ..core import model
@ -11,21 +8,45 @@ from ..util.linalg import pdinv,mdot,jitchol
from ..util.plot import gpplot
from .. import kern
class EP_base:
"""
Expectation Propagation.
class EP:
def __init__(self,covariance,likelihood,Kmn=None,Knn_diag=None,epsilon=1e-3,power_ep=[1.,1.]):
"""
Expectation Propagation
This is just the base class for expectation propagation. We'll extend it for full and sparse EP.
"""
def __init__(self,likelihood,epsilon=1e-3,powerep=[1.,1.]):
Arguments
---------
X : input observations
likelihood : Output's likelihood (likelihood class)
kernel : a GPy kernel (kern class)
inducing : Either an array specifying the inducing points location or a sacalar defining their number. None value for using a non-sparse model is used.
power_ep : Power-EP parameters (eta,delta) - 2x1 numpy array (floats)
epsilon : Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
"""
self.likelihood = likelihood
assert covariance.shape[0] == covariance.shape[1]
if Kmn is not None:
self.Kmm = covariance
self.Kmn = Kmn
self.M = self.Kmn.shape[0]
self.N = self.Kmn.shape[1]
assert self.M < self.N, 'The number of inducing inputs must be smaller than the number of observations'
else:
self.K = covariance
self.N = self.K.shape[0]
if Knn_diag is not None:
self.Knn_diag = Knn_diag
assert len(Knn_diag) == self.N, 'Knn_diagonal has size different from N'
self.epsilon = epsilon
self.eta, self.delta = powerep
self.eta, self.delta = power_ep
self.jitter = 1e-12
#Initial values - Likelihood approximation parameters:
#p(y|f) = t(f|tau_tilde,v_tilde)
self.restart_EP()
"""
Initial values - Likelihood approximation parameters:
p(y|f) = t(f|tau_tilde,v_tilde)
"""
self.tau_tilde = np.zeros(self.N)
self.v_tilde = np.zeros(self.N)
def restart_EP(self):
"""
@ -35,26 +56,11 @@ class EP_base:
self.v_tilde = np.zeros(self.N)
self.mu = np.zeros(self.N)
class Full(EP_base):
"""
:param likelihood: Output's likelihood (e.g. probit)
:type likelihood: GPy.inference.likelihood instance
:param K: prior covariance matrix
:type K: np.ndarray (N x N)
:param likelihood: Output's likelihood (e.g. probit)
:type likelihood: GPy.inference.likelihood instance
:param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
:param powerep: Power-EP parameters (eta,delta) - 2x1 numpy array (floats)
"""
def __init__(self,K,likelihood,*args,**kwargs):
assert K.shape[0] == K.shape[1]
self.K = K
self.N = self.K.shape[0]
EP_base.__init__(self,likelihood,*args,**kwargs)
def fit_EP(self,messages=False):
class Full(EP):
def fit_EP(self):
"""
The expectation-propagation algorithm.
For nomenclature see Rasmussen & Williams 2006 (pag. 52-60)
For nomenclature see Rasmussen & Williams 2006.
"""
#Prior distribution parameters: p(f|X) = N(f|0,K)
#self.K = self.kernel.K(self.X,self.X)
@ -69,16 +75,15 @@ class Full(EP_base):
sigma_ = 1./tau_
mu_ = v_/tau_
"""
self.tau_ = np.empty(self.N,dtype=np.float64)
self.v_ = np.empty(self.N,dtype=np.float64)
self.tau_ = np.empty(self.N,dtype=float)
self.v_ = np.empty(self.N,dtype=float)
#Initial values - Marginal moments
z = np.empty(self.N,dtype=np.float64)
self.Z_hat = np.empty(self.N,dtype=np.float64)
phi = np.empty(self.N,dtype=np.float64)
mu_hat = np.empty(self.N,dtype=np.float64)
sigma2_hat = np.empty(self.N,dtype=np.float64)
z = np.empty(self.N,dtype=float)
self.Z_hat = np.empty(self.N,dtype=float)
phi = np.empty(self.N,dtype=float)
mu_hat = np.empty(self.N,dtype=float)
sigma2_hat = np.empty(self.N,dtype=float)
#Approximation
epsilon_np1 = self.epsilon + 1.
@ -87,7 +92,8 @@ class Full(EP_base):
self.np1 = [self.tau_tilde.copy()]
self.np2 = [self.v_tilde.copy()]
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
update_order = np.random.permutation(self.N)
update_order = np.arange(self.N)
random.shuffle(update_order)
for i in update_order:
#Cavity distribution parameters
self.tau_[i] = 1./self.Sigma[i,i] - self.eta*self.tau_tilde[i]
@ -104,6 +110,7 @@ class Full(EP_base):
self.Sigma = self.Sigma - Delta_tau/(1.+ Delta_tau*self.Sigma[i,i])*np.dot(si,si.T)
self.mu = np.dot(self.Sigma,self.v_tilde)
self.iterations += 1
print self.tau_tilde[i] #TODO erase me
#Sigma recomptutation with Cholesky decompositon
Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*(self.K)
B = np.eye(self.N) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K
@ -111,35 +118,98 @@ class Full(EP_base):
V,info = linalg.flapack.dtrtrs(L,Sroot_tilde_K,lower=1)
self.Sigma = self.K - np.dot(V.T,V)
self.mu = np.dot(self.Sigma,self.v_tilde)
epsilon_np1 = np.mean(self.tau_tilde-self.np1[-1]**2)
epsilon_np2 = np.mean(self.v_tilde-self.np2[-1]**2)
epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N
epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
self.np1.append(self.tau_tilde.copy())
self.np2.append(self.v_tilde.copy())
if messages:
print "EP iteration %i, epsiolon %d"%(self.iterations,epsilon_np1)
return self.tau_tilde[:,None], self.v_tilde[:,None], self.Z_hat[:,None], self.tau_[:,None], self.v_[:,None]
class FITC(EP_base):
"""
:param likelihood: Output's likelihood (e.g. probit)
:type likelihood: GPy.inference.likelihood instance
:param Knn_diag: The diagonal elements of Knn is a 1D vector
:param Kmn: The 'cross' variance between inducing inputs and data
:param Kmm: the covariance matrix of the inducing inputs
:param likelihood: Output's likelihood (e.g. probit)
:type likelihood: GPy.inference.likelihood instance
:param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
:param powerep: Power-EP parameters (eta,delta) - 2x1 numpy array (floats)
"""
def __init__(self,likelihood,Knn_diag,Kmn,Kmm,*args,**kwargs):
self.Knn_diag = Knn_diag
self.Kmn = Kmn
self.Kmm = Kmm
self.M = self.Kmn.shape[0]
self.N = self.Kmn.shape[1]
assert self.M <= self.N, 'The number of inducing inputs must be smaller than the number of observations'
assert len(Knn_diag) == self.N, 'Knn_diagonal has size different from N'
EP_base.__init__(self,likelihood,*args,**kwargs)
class DTC(EP):
def fit_EP(self):
"""
The expectation-propagation algorithm with sparse pseudo-input.
For nomenclature see ... 2013.
"""
"""
Prior approximation parameters:
q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0)
Sigma0 = Qnn = Knm*Kmmi*Kmn
"""
self.Kmmi, self.Kmm_hld = pdinv(self.Kmm)
self.KmnKnm = np.dot(self.Kmn, self.Kmn.T)
self.KmmiKmn = np.dot(self.Kmmi,self.Kmn)
self.Qnn_diag = np.sum(self.Kmn*self.KmmiKmn,-2)
self.LLT0 = self.Kmm.copy()
"""
Posterior approximation: q(f|y) = N(f| mu, Sigma)
Sigma = Diag + P*R.T*R*P.T + K
mu = w + P*gamma
"""
self.mu = np.zeros(self.N)
self.LLT = self.Kmm.copy()
self.Sigma_diag = self.Qnn_diag.copy()
"""
Initial values - Cavity distribution parameters:
q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)}
sigma_ = 1./tau_
mu_ = v_/tau_
"""
self.tau_ = np.empty(self.N,dtype=float)
self.v_ = np.empty(self.N,dtype=float)
#Initial values - Marginal moments
z = np.empty(self.N,dtype=float)
self.Z_hat = np.empty(self.N,dtype=float)
phi = np.empty(self.N,dtype=float)
mu_hat = np.empty(self.N,dtype=float)
sigma2_hat = np.empty(self.N,dtype=float)
#Approximation
epsilon_np1 = 1
epsilon_np2 = 1
self.iterations = 0
self.np1 = [self.tau_tilde.copy()]
self.np2 = [self.v_tilde.copy()]
while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
update_order = np.arange(self.N)
random.shuffle(update_order)
for i in update_order:
#Cavity distribution parameters
self.tau_[i] = 1./self.Sigma_diag[i] - self.eta*self.tau_tilde[i]
self.v_[i] = self.mu[i]/self.Sigma_diag[i] - self.eta*self.v_tilde[i]
#Marginal moments
self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood.moments_match(i,self.tau_[i],self.v_[i])
#Site parameters update
Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./self.Sigma_diag[i])
Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - self.mu[i]/self.Sigma_diag[i])
self.tau_tilde[i] = self.tau_tilde[i] + Delta_tau
self.v_tilde[i] = self.v_tilde[i] + Delta_v
#Posterior distribution parameters update
self.LLT = self.LLT + np.outer(self.Kmn[:,i],self.Kmn[:,i])*Delta_tau
L = jitchol(self.LLT)
V,info = linalg.flapack.dtrtrs(L,self.Kmn,lower=1)
self.Sigma_diag = np.sum(V*V,-2)
si = np.sum(V.T*V[:,i],-1)
self.mu = self.mu + (Delta_v-Delta_tau*self.mu[i])*si
self.iterations += 1
#Sigma recomputation with Cholesky decompositon
self.LLT0 = self.LLT0 + np.dot(self.Kmn*self.tau_tilde[None,:],self.Kmn.T)
self.L = jitchol(self.LLT)
V,info = linalg.flapack.dtrtrs(L,self.Kmn,lower=1)
V2,info = linalg.flapack.dtrtrs(L.T,V,lower=0)
self.Sigma_diag = np.sum(V*V,-2)
Knmv_tilde = np.dot(self.Kmn,self.v_tilde)
self.mu = np.dot(V2.T,Knmv_tilde)
epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N
epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
self.np1.append(self.tau_tilde.copy())
self.np2.append(self.v_tilde.copy())
return self.tau_tilde[:,None], self.v_tilde[:,None], self.Z_hat[:,None], self.tau_[:,None], self.v_[:,None]
class FITC(EP):
def fit_EP(self):
"""
The expectation-propagation algorithm with sparse pseudo-input.
@ -178,15 +248,15 @@ class FITC(EP_base):
sigma_ = 1./tau_
mu_ = v_/tau_
"""
self.tau_ = np.empty(self.N,dtype=np.float64)
self.v_ = np.empty(self.N,dtype=np.float64)
self.tau_ = np.empty(self.N,dtype=float)
self.v_ = np.empty(self.N,dtype=float)
#Initial values - Marginal moments
z = np.empty(self.N,dtype=np.float64)
self.Z_hat = np.empty(self.N,dtype=np.float64)
phi = np.empty(self.N,dtype=np.float64)
mu_hat = np.empty(self.N,dtype=np.float64)
sigma2_hat = np.empty(self.N,dtype=np.float64)
z = np.empty(self.N,dtype=float)
self.Z_hat = np.empty(self.N,dtype=float)
phi = np.empty(self.N,dtype=float)
mu_hat = np.empty(self.N,dtype=float)
sigma2_hat = np.empty(self.N,dtype=float)
#Approximation
epsilon_np1 = 1
@ -238,3 +308,4 @@ class FITC(EP_base):
epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
self.np1.append(self.tau_tilde.copy())
self.np2.append(self.v_tilde.copy())
return self.tau_tilde[:,None], self.v_tilde[:,None], self.Z_hat[:,None], self.tau_[:,None], self.v_[:,None]

View file

@ -21,7 +21,7 @@ class likelihood:
self.location = location
self.scale = scale
def plot1Da(self,X_new,Mean_new,Var_new,X_u,Mean_u,Var_u):
def plot1Da(self,X,mean,var,Z=None,mean_Z=None,var_Z=None):
"""
Plot the predictive distribution of the GP model for 1-dimensional inputs
@ -32,10 +32,18 @@ class likelihood:
:param Mean_u: mean values at X_u
:param Var_new: variance values at X_u
"""
assert X_new.shape[1] == 1, 'Number of dimensions must be 1'
gpplot(X_new,Mean_new,Var_new)
pb.errorbar(X_u,Mean_u,2*np.sqrt(Var_u),fmt='r+')
pb.plot(X_u,Mean_u,'ro')
assert X.shape[1] == 1, 'Number of dimensions must be 1'
gpplot(X,mean,var.flatten())
pb.errorbar(Z.flatten(),mean_Z.flatten(),2*np.sqrt(var_Z.flatten()),fmt='r+')
pb.plot(Z,mean_Z,'ro')
def plot1Db(self,X_obs,X,phi,Z=None):
assert X_obs.shape[1] == 1, 'Number of dimensions must be 1'
gpplot(X,phi,np.zeros(X.shape[0]))
pb.plot(X_obs,(self.Y+1)/2,'kx',mew=1.5)
pb.ylim(-0.2,1.2)
if Z is not None:
pb.plot(Z,Z*0+.5,'r|',mew=1.5,markersize=12)
def plot2D(self,X,X_new,F_new,U=None):
"""
@ -90,16 +98,11 @@ class probit(likelihood):
sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat)
return Z_hat, mu_hat, sigma2_hat
def plot1Db(self,X,X_new,F_new,U=None):
assert X.shape[1] == 1, 'Number of dimensions must be 1'
gpplot(X_new,F_new,np.zeros(X_new.shape[0]))
pb.plot(X,(self.Y+1)/2,'kx',mew=1.5)
pb.ylim(-0.2,1.2)
if U is not None:
pb.plot(U,U*0+.5,'r|',mew=1.5,markersize=12)
def predictive_mean(self,mu,variance):
return stats.norm.cdf(mu/np.sqrt(1+variance))
def predictive_mean(self,mu,var):
mu = mu.flatten()
var = var.flatten()
return stats.norm.cdf(mu/np.sqrt(1+var))
def _log_likelihood_gradients():
raise NotImplementedError

312
GPy/models/GP.py Normal file
View file

@ -0,0 +1,312 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
from .. import kern
from ..core import model
from ..util.linalg import pdinv,mdot
from ..util.plot import gpplot, Tango
from ..inference.EP import Full
from ..inference.likelihoods import likelihood,probit,poisson,gaussian
class GP(model):
"""
Gaussian Process model for regression
:param X: input observations
:param Y: observed values
:param kernel: a GPy kernel, defaults to rbf+white
:param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_X: False|True
:param normalize_Y: whether to normalize the input data before computing (predictions will be in original scales)
:type normalize_Y: False|True
:param Xslices: how the X,Y data co-vary in the kernel (i.e. which "outputs" they correspond to). See (link:slicing)
:rtype: model object
.. Note:: Multiple independent outputs are allowed using columns of Y
"""
def __init__(self,X,Y=None,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None,likelihood=None,epsilon_ep=1e-3,epsion_em=.1,power_ep=[1.,1.]):
#TODO: make beta parameter explicit
# parse arguments
self.Xslices = Xslices
self.X = X
self.N, self.Q = self.X.shape
assert len(self.X.shape)==2
if kernel is None:
kernel = kern.rbf(X.shape[1]) + kern.bias(X.shape[1]) + kern.white(X.shape[1])
else:
assert isinstance(kernel, kern.kern)
self.kern = kernel
#here's some simple normalisation
if normalize_X:
self._Xmean = X.mean(0)[None,:]
self._Xstd = X.std(0)[None,:]
self.X = (X.copy() - self._Xmean) / self._Xstd
if hasattr(self,'Z'):
self.Z = (self.Z - self._Xmean) / self._Xstd
else:
self._Xmean = np.zeros((1,self.X.shape[1]))
self._Xstd = np.ones((1,self.X.shape[1]))
# Y - likelihood related variables, these might change whether using EP or not
if likelihood is None:
assert Y is not None, "Either Y or likelihood must be defined"
self.likelihood = gaussian(Y)
else:
self.likelihood = likelihood
assert len(self.likelihood.Y.shape)==2
assert self.X.shape[0] == self.likelihood.Y.shape[0]
self.N, self.D = self.likelihood.Y.shape
if isinstance(self.likelihood,gaussian):
self.EP = False
self.Y = Y
#here's some simple normalisation
if normalize_Y:
self._Ymean = Y.mean(0)[None,:]
self._Ystd = Y.std(0)[None,:]
self.Y = (Y.copy()- self._Ymean) / self._Ystd
else:
self._Ymean = np.zeros((1,self.Y.shape[1]))
self._Ystd = np.ones((1,self.Y.shape[1]))
if self.D > self.N:
# then it's more efficient to store YYT
self.YYT = np.dot(self.Y, self.Y.T)
else:
self.YYT = None
else:
# Y is defined after approximating the likelihood
self.EP = True
self.eta,self.delta = power_ep
self.epsilon_ep = epsilon_ep
self.tau_tilde = np.ones([self.N,self.D])
self.v_tilde = np.zeros([self.N,self.D])
self.tau_ = np.ones([self.N,self.D])
self.v_ = np.zeros([self.N,self.D])
self.Z_hat = np.ones([self.N,self.D])
model.__init__(self)
def _set_params(self,p):
# TODO: remove beta when using EP
self.kern._set_params_transformed(p)
if not self.EP:
self.K = self.kern.K(self.X,slices1=self.Xslices)
self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
else:
self._ep_covariance()
def _get_params(self):
# TODO: remove beta when using EP
return self.kern._get_params_transformed()
def _get_param_names(self):
# TODO: remove beta when using EP
return self.kern._get_param_names_transformed()
def approximate_likelihood(self):
assert not isinstance(self.likelihood, gaussian), "EP is only available for non-gaussian likelihoods"
self.ep_approx = Full(self.K,self.likelihood,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta])
self.tau_tilde, self.v_tilde, self.Z_hat, self.tau_, self.v_=self.ep_approx.fit_EP()
# Y: EP likelihood is defined as a regression model for mu_tilde
self.Y = self.v_tilde/self.tau_tilde
self._Ymean = np.zeros((1,self.Y.shape[1]))
self._Ystd = np.ones((1,self.Y.shape[1]))
if self.D > self.N:
# then it's more efficient to store YYT
self.YYT = np.dot(self.Y, self.Y.T)
else:
self.YYT = None
self.mu_ = self.v_/self.tau_
self._ep_covariance()
def _ep_covariance(self):
# Kernel plus noise variance term
self.K = self.kern.K(self.X,slices1=self.Xslices) + np.diag(1./self.tau_tilde.flatten())
self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
def _model_fit_term(self):
"""
Computes the model fit using YYT if it's available
"""
if self.YYT is None:
return -0.5*np.sum(np.square(np.dot(self.Li,self.Y)))
else:
return -0.5*np.sum(np.multiply(self.Ki, self.YYT))
def _normalization_term(self):
"""
Computes the marginal likelihood normalization constants
"""
sigma_sum = 1./self.tau_ + 1./self.tau_tilde
mu_diff_2 = (self.mu_ - self.Y)**2
penalty_term = np.sum(np.log(self.Z_hat))
return penalty_term + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum)
def log_likelihood(self):
"""
The log marginal likelihood for an EP model can be written as the log likelihood of
a regression model for a new variable Y* = v_tilde/tau_tilde, with a covariance
matrix K* = K + diag(1./tau_tilde) plus a normalization term.
"""
complexity_term = -0.5*self.D*self.Kplus_logdet
normalization_term = 0 if self.EP == False else self.normalization_term()
return complexity_term + normalization_term + self._model_fit_term()
def log_likelihood(self):
complexity_term = -0.5*self.N*self.D*np.log(2.*np.pi) - 0.5*self.D*self.K_logdet
return complexity_term + self._model_fit_term()
def dL_dK(self):
if self.YYT is None:
alpha = np.dot(self.Ki,self.Y)
dL_dK = 0.5*(np.dot(alpha,alpha.T)-self.D*self.Ki)
else:
dL_dK = 0.5*(mdot(self.Ki, self.YYT, self.Ki) - self.D*self.Ki)
return dL_dK
def _log_likelihood_gradients(self):
return self.kern.dK_dtheta(partial=self.dL_dK(),X=self.X)
def predict(self,Xnew, slices=None, full_cov=False):
"""
Predict the function(s) at the new point(s) Xnew.
Arguments
---------
:param Xnew: The points at which to make a prediction
:type Xnew: np.ndarray, Nnew x self.Q
:param slices: specifies which outputs kernel(s) the Xnew correspond to (see below)
:type slices: (None, list of slice objects, list of ints)
:param full_cov: whether to return the folll covariance matrix, or just the diagonal
:type full_cov: bool
:rtype: posterior mean, a Numpy array, Nnew x self.D
:rtype: posterior variance, a Numpy array, Nnew x Nnew x (self.D)
.. Note:: "slices" specifies how the the points X_new co-vary wich the training points.
- If None, the new points covary throigh every kernel part (default)
- If a list of slices, the i^th slice specifies which data are affected by the i^th kernel part
- If a list of booleans, specifying which kernel parts are active
If full_cov and self.D > 1, the return shape of var is Nnew x Nnew x self.D. If self.D == 1, the return shape is Nnew x Nnew.
This is to allow for different normalisations of the output dimensions.
"""
#normalise X values
Xnew = (Xnew.copy() - self._Xmean) / self._Xstd
mu, var, phi = self._raw_predict(Xnew, slices, full_cov)
#un-normalise
mu = mu*self._Ystd + self._Ymean
if full_cov:
if self.D==1:
var *= np.square(self._Ystd)
else:
var = var[:,:,None] * np.square(self._Ystd)
else:
if self.D==1:
var *= np.square(np.squeeze(self._Ystd))
else:
var = var[:,None] * np.square(self._Ystd)
return mu,var,phi
def _raw_predict(self,_Xnew,slices, full_cov=False):
"""Internal helper function for making predictions, does not account for normalisation"""
Kx = self.kern.K(self.X,_Xnew, slices1=self.Xslices,slices2=slices)
mu = np.dot(np.dot(Kx.T,self.Ki),self.Y)
KiKx = np.dot(self.Ki,Kx)
if full_cov:
Kxx = self.kern.K(_Xnew, slices1=slices,slices2=slices)
var = Kxx - np.dot(KiKx.T,Kx)
else:
Kxx = self.kern.Kdiag(_Xnew, slices=slices)
var = Kxx - np.sum(np.multiply(KiKx,Kx),0)
phi = None if not self.EP else self.likelihood.predictive_mean(mu,var)
return mu, var, phi
def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None):
"""
:param samples: the number of a posteriori samples to plot
:param which_data: which if the training data to plot (default all)
:type which_data: 'all' or a slice object to slice self.X, self.Y
:param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits
:param which_functions: which of the kernel functions to plot (additively)
:type which_functions: list of bools
:param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
Plot the posterior of the GP.
- In one dimension, the function is plotted with a shaded region identifying two standard deviations.
- In two dimsensions, a contour-plot shows the mean predicted function
- In higher dimensions, we've no implemented this yet !TODO!
Can plot only part of the data and part of the posterior functions using which_data and which_functions
"""
if which_functions=='all':
which_functions = [True]*self.kern.Nparts
if which_data=='all':
which_data = slice(None)
X = self.X[which_data,:]
Y = self.Y[which_data,:]
Xorig = X*self._Xstd + self._Xmean
Yorig = Y*self._Ystd + self._Ymean if not self.EP else self.likelihood.Y
if plot_limits is None:
xmin,xmax = Xorig.min(0),Xorig.max(0)
xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
elif len(plot_limits)==2:
xmin, xmax = plot_limits
else:
raise ValueError, "Bad limits for plotting"
if self.X.shape[1]==1:
Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None]
m,v,phi = self.predict(Xnew,slices=which_functions)
if self.EP:
pb.subplot(211)
gpplot(Xnew,m,v)
if samples:
s = np.random.multivariate_normal(m.flatten(),v,samples)
pb.plot(Xnew.flatten(),s.T, alpha = 0.4, c='#3465a4', linewidth = 0.8)
if not self.EP:
pb.plot(Xorig,Yorig,'kx',mew=1.5)
pb.xlim(xmin,xmax)
else:
pb.xlim(xmin,xmax)
pb.subplot(212)
self.likelihood.plot1Db(self.X,Xnew,phi)
pb.xlim(xmin,xmax)
elif self.X.shape[1]==2:
resolution = 50 or resolution
xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution]
Xtest = np.vstack((xx.flatten(),yy.flatten())).T
zz,vv,phi = self.predict(Xtest,slices=which_functions)
zz = zz.reshape(resolution,resolution)
pb.contour(xx,yy,zz,vmin=zz.min(),vmax=zz.max(),cmap=pb.cm.jet)
pb.scatter(Xorig[:,0],Xorig[:,1],40,Yorig,linewidth=0,cmap=pb.cm.jet,vmin=zz.min(),vmax=zz.max())
pb.xlim(xmin[0],xmax[0])
pb.ylim(xmin[1],xmax[1])
else:
raise NotImplementedError, "Cannot plot GPs with more than two input dimensions"

View file

@ -1,160 +0,0 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
from scipy import stats, linalg
from .. import kern
from ..inference.Expectation_Propagation import Full
from ..inference.likelihoods import likelihood,probit#,poisson,gaussian
from ..core import model
from ..util.linalg import pdinv,jitchol
from ..util.plot import gpplot
class GP_EP(model):
def __init__(self,X,likelihood,kernel=None,epsilon_ep=1e-3,epsion_em=.1,powerep=[1.,1.]):
"""
Simple Gaussian Process with Non-Gaussian likelihood
Arguments
---------
:param X: input observations (NxD numpy.darray)
:param likelihood: a GPy likelihood (likelihood class)
:param kernel: a GPy kernel (kern class)
:param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1 (float)
:param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.] (list)
:rtype: GPy model class.
"""
if kernel is None:
kernel = kern.rbf(X.shape[1]) + kern.bias(X.shape[1]) + kern.white(X.shape[1])
assert isinstance(kernel,kern.kern), 'kernel is not a kern instance'
self.likelihood = likelihood
self.Y = self.likelihood.Y
self.kernel = kernel
self.X = X
self.N, self.D = self.X.shape
self.eta,self.delta = powerep
self.epsilon_ep = epsilon_ep
self.jitter = 1e-12
self.K = self.kernel.K(self.X)
model.__init__(self)
def _set_params(self,p):
self.kernel._set_params_transformed(p)
def _get_params(self):
return self.kernel._get_params_transformed()
def _get_param_names(self):
return self.kernel._get_param_names_transformed()
def approximate_likelihood(self):
self.ep_approx = Full(self.K,self.likelihood,epsilon=self.epsilon_ep,powerep=[self.eta,self.delta])
self.ep_approx.fit_EP()
def posterior_param(self):
self.K = self.kernel.K(self.X)
self.Sroot_tilde_K = np.sqrt(self.ep_approx.tau_tilde)[:,None]*self.K
B = np.eye(self.N) + np.sqrt(self.ep_approx.tau_tilde)[None,:]*self.Sroot_tilde_K
#self.L = np.linalg.cholesky(B)
self.L = jitchol(B)
V,info = linalg.flapack.dtrtrs(self.L,self.Sroot_tilde_K,lower=1)
self.Sigma = self.K - np.dot(V.T,V)
self.mu = np.dot(self.Sigma,self.ep_approx.v_tilde)
def log_likelihood(self):
"""
Returns
-------
The EP approximation to the log-marginal likelihood
"""
self.posterior_param()
mu_ = self.ep_approx.v_/self.ep_approx.tau_
L1 =.5*sum(np.log(1+self.ep_approx.tau_tilde*1./self.ep_approx.tau_))-sum(np.log(np.diag(self.L)))
L2A =.5*np.sum((self.Sigma-np.diag(1./(self.ep_approx.tau_+self.ep_approx.tau_tilde))) * np.dot(self.ep_approx.v_tilde[:,None],self.ep_approx.v_tilde[None,:]))
L2B = .5*np.dot(mu_*(self.ep_approx.tau_/(self.ep_approx.tau_tilde+self.ep_approx.tau_)),self.ep_approx.tau_tilde*mu_ - 2*self.ep_approx.v_tilde)
L3 = sum(np.log(self.ep_approx.Z_hat))
return L1 + L2A + L2B + L3
def _log_likelihood_gradients(self):
dK_dp = self.kernel.dK_dtheta(self.X)
self.dK_dp = dK_dp
aux1,info_1 = linalg.flapack.dtrtrs(self.L,np.dot(self.Sroot_tilde_K,self.ep_approx.v_tilde),lower=1)
b = self.ep_approx.v_tilde - np.sqrt(self.ep_approx.tau_tilde)*linalg.flapack.dtrtrs(self.L.T,aux1)[0]
U,info_u = linalg.flapack.dtrtrs(self.L,np.diag(np.sqrt(self.ep_approx.tau_tilde)),lower=1)
dL_dK = 0.5*(np.outer(b,b)-np.dot(U.T,U))
self.dL_dK = dL_dK
return np.array([np.sum(dK_dpi*dL_dK) for dK_dpi in dK_dp.T])
def predict(self,X):
#TODO: check output dimensions
self.posterior_param()
K_x = self.kernel.K(self.X,X)
Kxx = self.kernel.K(X)
aux1,info = linalg.flapack.dtrtrs(self.L,np.dot(self.Sroot_tilde_K,self.ep_approx.v_tilde),lower=1)
aux2,info = linalg.flapack.dtrtrs(self.L.T, aux1,lower=0)
zeta = np.sqrt(self.ep_approx.tau_tilde)*aux2
f = np.dot(K_x.T,self.ep_approx.v_tilde-zeta)
v,info = linalg.flapack.dtrtrs(self.L,np.sqrt(self.ep_approx.tau_tilde)[:,None]*K_x,lower=1)
variance = Kxx - np.dot(v.T,v)
vdiag = np.diag(variance)
y=self.likelihood.predictive_mean(f,vdiag)
return f,vdiag,y
def plot(self):
"""
Plot the fitted model: training function values, inducing points used, mean estimate and confidence intervals.
"""
if self.X.shape[1]==1:
pb.figure()
xmin,xmax = self.X.min(),self.X.max()
xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
Xnew = np.linspace(xmin,xmax,100)[:,None]
mu_f, var_f, mu_phi = self.predict(Xnew)
pb.subplot(211)
self.likelihood.plot1Da(X_new=Xnew,Mean_new=mu_f,Var_new=var_f,X_u=self.X,Mean_u=self.mu,Var_u=np.diag(self.Sigma))
pb.subplot(212)
self.likelihood.plot1Db(self.X,Xnew,mu_phi)
elif self.X.shape[1]==2:
pb.figure()
x1min,x1max = self.X[:,0].min(0),self.X[:,0].max(0)
x2min,x2max = self.X[:,1].min(0),self.X[:,1].max(0)
x1min, x1max = x1min-0.2*(x1max-x1min), x1max+0.2*(x1max-x1min)
x2min, x2max = x2min-0.2*(x2max-x2min), x2max+0.2*(x1max-x1min)
axis1 = np.linspace(x1min,x1max,50)
axis2 = np.linspace(x2min,x2max,50)
XX1, XX2 = [e.flatten() for e in np.meshgrid(axis1,axis2)]
Xnew = np.c_[XX1.flatten(),XX2.flatten()]
f,v,p = self.predict(Xnew)
self.likelihood.plot2D(self.X,Xnew,p)
else:
raise NotImplementedError, "Cannot plot GPs with more than two input dimensions"
def em(self,max_f_eval=1e4,epsilon=.1,plot_all=False): #TODO check this makes sense
"""
Fits sparse_EP and optimizes the hyperparametes iteratively until convergence is achieved.
"""
self.epsilon_em = epsilon
log_likelihood_change = self.epsilon_em + 1.
self.parameters_path = [self.kernel._get_params()]
self.approximate_likelihood()
self.site_approximations_path = [[self.ep_approx.tau_tilde,self.ep_approx.v_tilde]]
self.log_likelihood_path = [self.log_likelihood()]
iteration = 0
while log_likelihood_change > self.epsilon_em:
print 'EM iteration', iteration
self.optimize(max_f_eval = max_f_eval)
log_likelihood_new = self.log_likelihood()
log_likelihood_change = log_likelihood_new - self.log_likelihood_path[-1]
if log_likelihood_change < 0:
print 'log_likelihood decrement'
self.kernel._set_params_transformed(self.parameters_path[-1])
self.kernM._set_params_transformed(self.parameters_path[-1])
else:
self.approximate_likelihood()
self.log_likelihood_path.append(self.log_likelihood())
self.parameters_path.append(self.kernel._get_params())
self.site_approximations_path.append([self.ep_approx.tau_tilde,self.ep_approx.v_tilde])
iteration += 1

View file

@ -6,7 +6,8 @@ from GP_regression import GP_regression
from sparse_GP_regression import sparse_GP_regression
from GPLVM import GPLVM
from warped_GP import warpedGP
from GP_EP import GP_EP
from generalized_FITC import generalized_FITC
from sparse_GPLVM import sparse_GPLVM
from uncollapsed_sparse_GP import uncollapsed_sparse_GP
from GP import GP
from sparse_GP import sparse_GP

View file

@ -9,7 +9,8 @@ from .. import kern
from ..core import model
from ..util.linalg import pdinv,mdot
from ..util.plot import gpplot
from ..inference.Expectation_Propagation import FITC
#from ..inference.Expectation_Propagation import FITC
from ..inference.EP import FITC
from ..inference.likelihoods import likelihood,probit
class generalized_FITC(model):

258
GPy/models/sparse_GP.py Normal file
View file

@ -0,0 +1,258 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
import pylab as pb
from ..util.linalg import mdot, jitchol, chol_inv, pdinv
from ..util.plot import gpplot
from .. import kern
from GP import GP
from ..inference.EP import Full
from ..inference.likelihoods import likelihood,probit,poisson,gaussian
#Still TODO:
# make use of slices properly (kernel can now do this)
# enable heteroscedatic noise (kernel will need to compute psi2 as a (NxMxM) array)
class sparse_GP(GP):
"""
Variational sparse GP model (Regression)
:param X: inputs
:type X: np.ndarray (N x Q)
:param Y: observed data
:type Y: np.ndarray of observations (N x D)
:param kernel : the kernel/covariance function. See link kernels
:type kernel: a GPy kernel
:param Z: inducing inputs (optional, see note)
:type Z: np.ndarray (M x Q) | None
:param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance)
:type X_uncertainty: np.ndarray (N x Q) | None
:param Zslices: slices for the inducing inputs (see slicing TODO: link)
:param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
:type M: int
:param beta: noise precision. TODO> ignore beta if doing EP
:type beta: float
:param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
:type normalize_(X|Y): bool
"""
def __init__(self,X,Y,kernel=None, X_uncertainty=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False,likelihood=None,method_ep='DTC',epsilon_ep=1e-3,epsilon_em=.1,power_ep=[1.,1.]):
if Z is None:
self.Z = np.random.permutation(X.copy())[:M]
self.M = M
else:
assert Z.shape[1]==X.shape[1]
self.Z = Z
self.M = Z.shape[0]
if X_uncertainty is None:
self.has_uncertain_inputs=False
else:
assert X_uncertainty.shape==X.shape
self.has_uncertain_inputs=True
self.X_uncertainty = X_uncertainty
self.beta = beta #FIXME
GP.__init__(self, X, Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y,likelihood=likelihood,epsilon_ep=epsilon_ep,epsion_em=epsilon_em,power_ep=power_ep)
self.beta = beta if isinstance(likelihood,gaussian) else self.tau_tilde #TODO this should be defined in GP.__init__
#normalise X uncertainty also
if self.has_uncertain_inputs:
self.X_uncertainty /= np.square(self._Xstd)
def _set_params(self, p):
if not self.EP:
self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
self.beta = p[self.M*self.Q]
self.kern._set_params(p[self.Z.size + 1:])
self.beta2 = self.beta**2
self._compute_kernel_matrices()
self._computations()
else:
self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
self.kern._set_params(p[self.Z.size:])
#self._compute_kernel_matrices() this is replaced by _ep_covariance
self._ep_covariance()
self._ep_computations()
def approximate_likelihood(self):
assert not isinstance(self.likelihood, gaussian), "EP is only available for non-gaussian likelihoods"
if self.ep_proxy == 'DTC':
self.ep_approx = DTC(self.Kmm,self.likelihood,self.psi1,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta])
elif self.ep_proxy == 'FITC':
self.Knn_diag = self.kern.psi0(self.Z,self.X, self.X_uncertainty) #TODO psi0 already calculates this
self.ep_approx = FITC(self.Kmm,self.likelihood,self.psi1,self.Knn_diag,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta])
else:
self.ep_approx = Full(self.X,self.likelihood,self.kernel,inducing=None,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta])
self.beta, self.v_tilde, self.Z_hat, self.tau_, self.v_=self.ep_approx.fit_EP()
# Y: EP likelihood is defined as a regression model for mu_tilde
self.Y = self.v_tilde/self.beta
self._Ymean = np.zeros((1,self.Y.shape[1]))
self._Ystd = np.ones((1,self.Y.shape[1]))
self.trbetaYYT = np.sum(self.beta*np.square(self.Y))
if self.D > self.N:
# then it's more efficient to store YYT
self.YYT = np.dot(self.Y, self.Y.T)
else:
self.YYT = None
self.mu_ = self.v_/self.tau_
self._ep_covariance()
self._computations()
def _ep_covariance(self):
self.Kmm = self.kern.K(self.Z)
if self.has_uncertain_inputs:
self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum()
self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T
self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty) #FIXME include beta
else:
#self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum()
self.Knn_diag = self.kern.Kdiag(self.X,slices=self.Xslices)
self.psi0 = (self.beta*self.Knn_diag).sum() #TODO check dimensions
self.psi1 = self.kern.K(self.Z,self.X)
#self.psi2 = np.dot(self.psi1,self.psi1.T)
self.psi2 = np.dot(self.psi1,self.beta*self.psi1.T)
def _compute_kernel_matrices(self):
# kernel computations, using BGPLVM notation
#TODO: slices for psi statistics (easy enough)
self.Kmm = self.kern.K(self.Z)
if self.has_uncertain_inputs:
self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum()
self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T
self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty)
else:
self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum()
self.psi1 = self.kern.K(self.Z,self.X)
self.psi2 = np.dot(self.psi1,self.psi1.T)
def _ep_computations(self):
# TODO find routine to multiply triangular matrices
self.V = self.beta*self.Y
self.psi1V = np.dot(self.psi1, self.V)
self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T)
self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)
#self.A = mdot(self.Lmi, self.beta*self.psi2, self.Lmi.T)
self.A = mdot(self.Lmi, self.psi2, self.Lmi.T)
self.B = np.eye(self.M) + self.A
self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B)
self.LLambdai = np.dot(self.LBi, self.Lmi)
#self.trace_K = self.psi0 - np.sum(np.dot(self.Lmi,self.psi1)**2,-1) #TODO check
self.trace_K = self.psi0 - np.trace(self.A)
self.LBL_inv = mdot(self.Lmi.T, self.Bi, self.Lmi)
self.C = mdot(self.LLambdai, self.psi1V)
self.G = mdot(self.LBL_inv, self.psi1VVpsi1, self.LBL_inv.T)
# Compute dL_dpsi
#self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N)
self.dL_dpsi0 = - 0.5 * self.D * self.beta.flatten() * np.ones(self.N) #TODO check
self.dL_dpsi1 = mdot(self.LLambdai.T,self.C,self.V.T)
#self.dL_dpsi2 = - 0.5 * self.beta * (self.D*(self.LBL_inv - self.Kmmi) + self.G)
self.dL_dpsi2 = - 0.5 * self.beta * (self.D*(self.LBL_inv - self.Kmmi) + self.G)
# Compute dL_dKmm
self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi) # dB
self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - 2.*self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi) + self.Kmmi) # dC
self.dL_dKmm += np.dot(np.dot(self.G,self.beta*self.psi2) - np.dot(self.LBL_inv, self.psi1VVpsi1), self.Kmmi) + 0.5*self.G # dE
def _get_params(self):
if not self.EP:
return np.hstack([self.Z.flatten(),self.beta,self.kern._get_params_transformed()])
else:
return np.hstack([self.Z.flatten(),self.kern._get_params_transformed()])
def _get_param_names(self):
if not self.EP:
return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + ['noise_precision']+self.kern._get_param_names_transformed()
else:
return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + self.kern._get_param_names_transformed()
def log_likelihood(self):
"""
Compute the (lower bound on the) log marginal likelihood
"""
beta_logdet = self.N*self.D*np.log(self.beta) if not self.EP else self.D*np.sum(np.log(self.beta))
A = -0.5*self.N*self.D*(np.log(2.*np.pi)) - 0.5*beta_logdet
B = -0.5*self.beta*self.D*self.trace_K if not self.EP else -0.5*self.D*self.trace_K
C = -0.5*self.D * self.B_logdet
D = -0.5*self.beta*self.trYYT if not self.EP else -0.5*self.trbetaYYT
E = +0.5*np.sum(self.psi1VVpsi1 * self.LBL_inv)
return A+B+C+D+E
def dL_dbeta(self):
"""
Compute the gradient of the log likelihood wrt beta.
"""
#TODO: suport heteroscedatic noise
dA_dbeta = 0.5 * self.N*self.D/self.beta
dB_dbeta = - 0.5 * self.D * self.trace_K
dC_dbeta = - 0.5 * self.D * np.sum(self.Bi*self.A)/self.beta
dD_dbeta = - 0.5 * self.trYYT
tmp = mdot(self.LBi.T, self.LLambdai, self.psi1V)
dE_dbeta = (np.sum(np.square(self.C)) - 0.5 * np.sum(self.A * np.dot(tmp, tmp.T)))/self.beta
return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta + dE_dbeta)
def dL_dtheta(self):
"""
Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel
"""
dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z)
if self.has_uncertain_inputs:
dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty)
dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # for multiple_beta, dL_dpsi2 will be a different shape
else:
#re-cast computations in psi2 back to psi1:
dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1)
dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X)
dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X)
return dL_dtheta
def dL_dZ(self):
"""
The derivative of the bound wrt the inducing inputs Z
"""
dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z,)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ
if self.has_uncertain_inputs:
dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty)
dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty)
else:
#re-cast computations in psi2 back to psi1:
dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1)
dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X)
return dL_dZ
def _log_likelihood_gradients(self):
return np.hstack([self.dL_dZ().flatten(), self.dL_dbeta(), self.dL_dtheta()])
def _raw_predict(self, Xnew, slices, full_cov=False):
"""Internal helper function for making predictions, does not account for normalisation"""
Kx = self.kern.K(self.Z, Xnew)
mu = mdot(Kx.T, self.LBL_inv, self.psi1V)
if full_cov:
noise_term = np.eye(Xnew.shape[0])/self.beta if not self.EP else 0
Kxx = self.kern.K(Xnew)
var = Kxx - mdot(Kx.T, (self.Kmmi - self.LBL_inv), Kx) + noise_term
else:
noise_term = 1./self.beta if not self.EP else 0
Kxx = self.kern.Kdiag(Xnew)
var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.LBL_inv, Kx),0) + noise_term
return mu,var
def plot(self, *args, **kwargs):
"""
Plot the fitted model: just call the GP_regression plot function and then add inducing inputs
"""
GP_regression.plot(self,*args,**kwargs)
if self.Q==1:
pb.plot(self.Z,self.Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12)
if self.has_uncertain_inputs:
pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten()))
if self.Q==2:
pb.plot(self.Z[:,0],self.Z[:,1],'wo')