Add eq_ode1 kern and ibp_lfm model

This commit is contained in:
cdguarnizo 2016-05-20 02:03:55 -05:00
parent a20f93850d
commit f844143353
4 changed files with 1150 additions and 0 deletions

View file

@ -24,6 +24,7 @@ from .src.ODE_st import ODE_st
from .src.ODE_t import ODE_t from .src.ODE_t import ODE_t
from .src.poly import Poly from .src.poly import Poly
from .src.eq_ode2 import EQ_ODE2 from .src.eq_ode2 import EQ_ODE2
from .src.eq_ode1 import EQ_ODE1
from .src.trunclinear import TruncLinear,TruncLinear_inf from .src.trunclinear import TruncLinear,TruncLinear_inf
from .src.splitKern import SplitKern,DEtime from .src.splitKern import SplitKern,DEtime
from .src.splitKern import DEtime as DiffGenomeKern from .src.splitKern import DEtime as DiffGenomeKern

612
GPy/kern/src/eq_ode1.py Normal file
View file

@ -0,0 +1,612 @@
# Copyright (c) 2014, Cristian Guarnizo.
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from scipy.special import erf, erfcx
from .kern import Kern
from ...core.parameterization import Param
from paramz.transformations import Logexp
from paramz.caching import Cache_this
class EQ_ODE1(Kern):
"""
Covariance function for first order differential equation driven by an exponentiated quadratic covariance.
This outputs of this kernel have the form
.. math::
\frac{\text{d}y_j}{\text{d}t} = \sum_{i=1}^R w_{j,i} u_i(t-\delta_j) - d_jy_j(t)
where :math:`R` is the rank of the system, :math:`w_{j,i}` is the sensitivity of the :math:`j`th output to the :math:`i`th latent function, :math:`d_j` is the decay rate of the :math:`j`th output and :math:`u_i(t)` are independent latent Gaussian processes goverened by an exponentiated quadratic covariance.
:param output_dim: number of outputs driven by latent function.
:type output_dim: int
:param W: sensitivities of each output to the latent driving function.
:type W: ndarray (output_dim x rank).
:param rank: If rank is greater than 1 then there are assumed to be a total of rank latent forces independently driving the system, each with identical covariance.
:type rank: int
:param decay: decay rates for the first order system.
:type decay: array of length output_dim.
:param delay: delay between latent force and output response.
:type delay: array of length output_dim.
:param kappa: diagonal term that allows each latent output to have an independent component to the response.
:type kappa: array of length output_dim.
.. Note: see first order differential equation examples in GPy.examples.regression for some usage.
"""
def __init__(self, input_dim=2, output_dim=1, rank=1, W = None, lengthscale=None, decay=None, active_dims=None, name='eq_ode1'):
assert input_dim == 2, "only defined for 1 input dims"
super(EQ_ODE1, self).__init__(input_dim=input_dim, active_dims=active_dims, name=name)
self.rank = rank
self.output_dim = output_dim
if lengthscale is None:
lengthscale = .5 + np.random.rand(self.rank)
else:
lengthscale = np.asarray(lengthscale)
assert lengthscale.size in [1, self.rank], "Bad number of lengthscales"
if lengthscale.size != self.rank:
lengthscale = np.ones(self.rank)*lengthscale
if W is None:
W = .5*np.random.randn(self.output_dim, self.rank)/np.sqrt(self.rank)
else:
assert W.shape == (self.output_dim, self.rank)
if decay is None:
decay = np.ones(self.output_dim)
else:
decay = np.asarray(decay)
assert decay.size in [1, self.output_dim], "Bad number of decay"
if decay.size != self.output_dim:
decay = np.ones(self.output_dim)*decay
# if kappa is None:
# self.kappa = np.ones(self.output_dim)
# else:
# kappa = np.asarray(kappa)
# assert kappa.size in [1, self.output_dim], "Bad number of kappa"
# if decay.size != self.output_dim:
# decay = np.ones(self.output_dim)*kappa
#self.kappa = Param('kappa', kappa, Logexp())
#self.delay = Param('delay', delay, Logexp())
#self.is_normalized = True
#self.is_stationary = False
#self.gaussian_initial = False
self.lengthscale = Param('lengthscale', lengthscale, Logexp())
self.decay = Param('decay', decay, Logexp())
self.W = Param('W', W)
self.link_parameters(self.lengthscale, self.decay, self.W)
@Cache_this(limit=3)
def K(self, X, X2=None):
#This way is not working, indexes are lost after using k._slice_X
#index = np.asarray(X, dtype=np.int)
#index = index.reshape(index.size,)
if hasattr(X, 'values'):
X = X.values
index = np.int_(np.round(X[:, 1]))
index = index.reshape(index.size,)
X_flag = index[0] >= self.output_dim
if X2 is None:
if X_flag:
#Calculate covariance function for the latent functions
index -= self.output_dim
return self._Kuu(X, index)
else:
raise NotImplementedError
else:
#This way is not working, indexes are lost after using k._slice_X
#index2 = np.asarray(X2, dtype=np.int)
#index2 = index2.reshape(index2.size,)
if hasattr(X2, 'values'):
X2 = X2.values
index2 = np.int_(np.round(X2[:, 1]))
index2 = index2.reshape(index2.size,)
X2_flag = index2[0] >= self.output_dim
#Calculate cross-covariance function
if not X_flag and X2_flag:
index2 -= self.output_dim
return self._Kfu(X, index, X2, index2) #Kfu
else:
index -= self.output_dim
return self._Kfu(X2, index2, X, index).T #Kuf
#Calculate the covariance function for diag(Kff(X,X))
def Kdiag(self, X):
#This way is not working, indexes are lost after using k._slice_X
#index = np.asarray(X, dtype=np.int)
#index = index.reshape(index.size,)
if hasattr(X, 'values'):
X = X.values
index = np.int_(X[:, 1])
index = index.reshape(index.size,)
#terms that move along t
t = X[:, 0].reshape(X.shape[0], 1)
d = np.unique(index) #Output Indexes
B = self.decay.values[d]
S = self.W.values[d, :]
#Index transformation
indd = np.arange(self.output_dim)
indd[d] = np.arange(d.size)
index = indd[index]
B = B.reshape(B.size, 1)
#Terms that move along q
lq = self.lengthscale.values.reshape(1, self.rank)
S2 = S*S
kdiag = np.empty((t.size, ))
#Dx1 terms
c0 = (S2/B)*((.5*np.sqrt(np.pi))*lq)
#DxQ terms
nu = lq*(B*.5)
nu2 = nu*nu
#Nx1 terms
gamt = -2.*B
gamt = gamt[index]*t
#NxQ terms
t_lq = t/lq
# Upsilon Calculations
# Using wofz
#erfnu = erf(nu)
upm = np.exp(nu2[index, :] + lnDifErf( nu[index, :] ,t_lq+nu[index,:] ))
upm[t[:, 0] == 0, :] = 0.
upv = np.exp(nu2[index, :] + gamt + lnDifErf( -t_lq+nu[index,:], nu[index, :] ) )
upv[t[:, 0] == 0, :] = 0.
#Covariance calculation
#kdiag = np.sum(c0[index, :]*(upm-upv), axis=1)
kdiag = c0[index, :]*(upm-upv)
return kdiag
def update_gradients_full(self, dL_dK, X, X2 = None):
#index = np.asarray(X, dtype=np.int)
#index = index.reshape(index.size,)
if hasattr(X, 'values'):
X = X.values
self.decay.gradient = np.zeros(self.decay.shape)
self.W.gradient = np.zeros(self.W.shape)
self.lengthscale.gradient = np.zeros(self.lengthscale.shape)
index = np.int_(np.round(X[:, 1]))
index = index.reshape(index.size,)
X_flag = index[0] >= self.output_dim
if X2 is None:
if X_flag: #Kuu or Kmm
index -= self.output_dim
tmp = dL_dK*self._gkuu_lq(X, index)
for q in np.unique(index):
ind = np.where(index == q)
self.lengthscale.gradient[q] = tmp[np.ix_(ind[0], ind[0])].sum()
else:
raise NotImplementedError
else: #Kfu or Knm
#index2 = np.asarray(X2, dtype=np.int)
#index2 = index2.reshape(index2.size,)
if hasattr(X2, 'values'):
X2 = X2.values
index2 = np.int_(np.round(X2[:, 1]))
index2 = index2.reshape(index2.size,)
X2_flag = index2[0] >= self.output_dim
if not X_flag and X2_flag: #Kfu
index2 -= self.output_dim
else: #Kuf
dL_dK = dL_dK.T #so we obtaing dL_Kfu
indtemp = index - self.output_dim
Xtemp = X
X = X2
X2 = Xtemp
index = index2
index2 = indtemp
glq, gSdq, gB = self._gkfu(X, index, X2, index2)
tmp = dL_dK*glq
for q in np.unique(index2):
ind = np.where(index2 == q)
self.lengthscale.gradient[q] = tmp[:, ind].sum()
tmpB = dL_dK*gB
tmp = dL_dK*gSdq
for d in np.unique(index):
ind = np.where(index == d)
self.decay.gradient[d] = tmpB[ind, :].sum()
for q in np.unique(index2):
ind2 = np.where(index2 == q)
self.W.gradient[d, q] = tmp[np.ix_(ind[0], ind2[0])].sum()
def update_gradients_diag(self, dL_dKdiag, X):
#index = np.asarray(X, dtype=np.int)
#index = index.reshape(index.size,)
if hasattr(X, 'values'):
X = X.values
self.decay.gradient = np.zeros(self.decay.shape)
self.W.gradient = np.zeros(self.W.shape)
self.lengthscale.gradient = np.zeros(self.lengthscale.shape)
index = np.int_(X[:, 1])
index = index.reshape(index.size,)
glq, gS, gB = self._gkdiag(X, index)
if dL_dKdiag.size == X.shape[0]:
dL_dKdiag = np.reshape(dL_dKdiag, (index.size, 1))
tmp = dL_dKdiag*glq
self.lengthscale.gradient = tmp.sum(0)
tmpB = dL_dKdiag*gB
tmp = dL_dKdiag*gS
for d in np.unique(index):
ind = np.where(index == d)
self.decay.gradient[d] = tmpB[ind, :].sum()
self.W.gradient[d, :] = tmp[ind].sum(0)
def gradients_X(self, dL_dK, X, X2=None):
#index = np.asarray(X, dtype=np.int)
#index = index.reshape(index.size,)
if hasattr(X, 'values'):
X = X.values
index = np.int_(np.round(X[:, 1]))
index = index.reshape(index.size,)
X_flag = index[0] >= self.output_dim
#If input_dim == 1, use this
#gX = np.zeros((X.shape[0], 1))
#Cheat to allow gradient for input_dim==2
gX = np.zeros(X.shape)
if X2 is None: #Kuu or Kmm
if X_flag:
index -= self.output_dim
gX[:, 0] = 2.*(dL_dK*self._gkuu_X(X, index)).sum(0)
return gX
else:
raise NotImplementedError
else: #Kuf or Kmn
#index2 = np.asarray(X2, dtype=np.int)
#index2 = index2.reshape(index2.size,)
if hasattr(X2, 'values'):
X2 = X2.values
index2 = np.int_(np.round(X2[:, 1]))
index2 = index2.reshape(index2.size,)
X2_flag = index2[0] >= self.output_dim
if X_flag and not X2_flag: #gradient of Kuf(Z, X) wrt Z
index -= self.output_dim
gX[:, 0] = (dL_dK*self._gkfu_z(X2, index2, X, index).T).sum(1)
return gX
else:
raise NotImplementedError
#---------------------------------------#
# Helper functions #
#---------------------------------------#
#Evaluation of squared exponential for LFM
def _Kuu(self, X, index):
index = index.reshape(index.size,)
t = X[:, 0].reshape(X.shape[0],)
lq = self.lengthscale.values.reshape(self.rank,)
lq2 = lq*lq
#Covariance matrix initialization
kuu = np.zeros((t.size, t.size))
#Assign 1. to diagonal terms
kuu[np.diag_indices(t.size)] = 1.
#Upper triangular indices
indtri1, indtri2 = np.triu_indices(t.size, 1)
#Block Diagonal indices among Upper Triangular indices
ind = np.where(index[indtri1] == index[indtri2])
indr = indtri1[ind]
indc = indtri2[ind]
r = t[indr] - t[indc]
r2 = r*r
#Calculation of covariance function
kuu[indr, indc] = np.exp(-r2/lq2[index[indr]])
#Completion of lower triangular part
kuu[indc, indr] = kuu[indr, indc]
return kuu
#Evaluation of cross-covariance function
def _Kfu(self, X, index, X2, index2):
#terms that move along t
t = X[:, 0].reshape(X.shape[0], 1)
d = np.unique(index) #Output Indexes
B = self.decay.values[d]
S = self.W.values[d, :]
#Index transformation
indd = np.arange(self.output_dim)
indd[d] = np.arange(d.size)
index = indd[index]
#Output related variables must be column-wise
B = B.reshape(B.size, 1)
#Input related variables must be row-wise
z = X2[:, 0].reshape(1, X2.shape[0])
lq = self.lengthscale.values.reshape((1, self.rank))
kfu = np.empty((t.size, z.size))
#DxQ terms
c0 = S*((.5*np.sqrt(np.pi))*lq)
nu = B*(.5*lq)
nu2 = nu**2
#1xM terms
z_lq = z/lq[0, index2]
#NxM terms
tz = t-z
tz_lq = tz/lq[0, index2]
# Upsilon Calculations
fullind = np.ix_(index, index2)
upsi = np.exp(nu2[fullind] - B[index]*tz + lnDifErf( -tz_lq + nu[fullind], z_lq+nu[fullind]))
upsi[t[:, 0] == 0, :] = 0.
#Covariance calculation
kfu = c0[fullind]*upsi
return kfu
#Gradient of Kuu wrt lengthscale
def _gkuu_lq(self, X, index):
t = X[:, 0].reshape(X.shape[0],)
index = index.reshape(X.shape[0],)
lq = self.lengthscale.values.reshape(self.rank,)
lq2 = lq*lq
#Covariance matrix initialization
glq = np.zeros((t.size, t.size))
#Upper triangular indices
indtri1, indtri2 = np.triu_indices(t.size, 1)
#Block Diagonal indices among Upper Triangular indices
ind = np.where(index[indtri1] == index[indtri2])
indr = indtri1[ind]
indc = indtri2[ind]
r = t[indr] - t[indc]
r2 = r*r
r2_lq2 = r2/lq2[index[indr]]
#Calculation of covariance function
er2_lq2 = np.exp(-r2_lq2)
#Gradient wrt lq
c = 2.*r2_lq2/lq[index[indr]]
glq[indr, indc] = er2_lq2*c
#Complete the lower triangular
glq[indc, indr] = glq[indr, indc]
return glq
#Be careful this derivative should be transpose it
def _gkuu_X(self, X, index): #Diagonal terms are always zero
t = X[:, 0].reshape(X.shape[0],)
index = index.reshape(index.size,)
lq = self.lengthscale.values.reshape(self.rank,)
lq2 = lq*lq
#Covariance matrix initialization
gt = np.zeros((t.size, t.size))
#Upper triangular indices
indtri1, indtri2 = np.triu_indices(t.size, 1) #Offset of 1 from the diagonal
#Block Diagonal indices among Upper Triangular indices
ind = np.where(index[indtri1] == index[indtri2])
indr = indtri1[ind]
indc = indtri2[ind]
r = t[indr] - t[indc]
r2 = r*r
r2_lq2 = r2/(-lq2[index[indr]])
#Calculation of covariance function
er2_lq2 = np.exp(r2_lq2)
#Gradient wrt t
c = 2.*r/lq2[index[indr]]
gt[indr, indc] = er2_lq2*c
#Complete the lower triangular
gt[indc, indr] = -gt[indr, indc]
return gt
#Gradients for Diagonal Kff
def _gkdiag(self, X, index):
index = index.reshape(index.size,)
#terms that move along t
d = np.unique(index)
B = self.decay[d].values
S = self.W[d, :].values
#Index transformation
indd = np.arange(self.output_dim)
indd[d] = np.arange(d.size)
index = indd[index]
#Output related variables must be column-wise
t = X[:, 0].reshape(X.shape[0], 1)
B = B.reshape(B.size, 1)
S2 = S*S
#Input related variables must be row-wise
lq = self.lengthscale.values.reshape(1, self.rank)
gB = np.empty((t.size,))
glq = np.empty((t.size, lq.size))
gS = np.empty((t.size, lq.size))
#Dx1 terms
c0 = S2*lq*np.sqrt(np.pi)
#DxQ terms
nu = (.5*lq)*B
nu2 = nu*nu
#Nx1 terms
gamt = -B[index]*t
egamt = np.exp(gamt)
e2gamt = egamt*egamt
#NxQ terms
t_lq = t/lq
t2_lq2 = -t_lq*t_lq
etlq2gamt = np.exp(t2_lq2 + gamt) #NXQ
##Upsilon calculations
#erfnu = erf(nu) #TODO: This can be improved
upm = np.exp(nu2[index, :] + lnDifErf( nu[index, :], t_lq + nu[index, :]) )
upm[t[:, 0] == 0, :] = 0.
upv = np.exp(nu2[index, :] + 2.*gamt + lnDifErf(-t_lq + nu[index, :], nu[index, :]) ) #egamt*upv
upv[t[:, 0] == 0, :] = 0.
#Gradient wrt S
c0_S = (S/B)*(lq*np.sqrt(np.pi))
gS = c0_S[index]*(upm - upv)
#For B
CB1 = (.5*lq)**2 - .5/B**2 #DXQ
lq2_2B = (.5*lq**2)*(S2/B) #DXQ
CB2 = 2.*etlq2gamt - e2gamt - 1. #NxQ
# gradient wrt B NxZ
gB = c0[index, :]*(CB1[index, :]*upm - (CB1[index, :] - t/B[index])*upv) + \
lq2_2B[index, :]*CB2
#Gradient wrt lengthscale
#DxQ terms
c0 = (.5*np.sqrt(np.pi))*(S2/B)*(1.+.5*(lq*B)**2)
Clq1 = S2*(lq*.5)
glq = c0[index]*(upm - upv) + Clq1[index]*CB2
return glq, gS, gB
def _gkfu(self, X, index, Z, index2):
index = index.reshape(index.size,)
#TODO: reduce memory usage
#terms that move along t
d = np.unique(index)
B = self.decay[d].values
S = self.W[d, :].values
#Index transformation
indd = np.arange(self.output_dim)
indd[d] = np.arange(d.size)
index = indd[index]
#t column
t = X[:, 0].reshape(X.shape[0], 1)
B = B.reshape(B.size, 1)
#z row
z = Z[:, 0].reshape(1, Z.shape[0])
index2 = index2.reshape(index2.size,)
lq = self.lengthscale.values.reshape((1, self.rank))
#kfu = np.empty((t.size, z.size))
glq = np.empty((t.size, z.size))
gSdq = np.empty((t.size, z.size))
gB = np.empty((t.size, z.size))
#Dx1 terms
B_2 = B*.5
S_pi = S*(.5*np.sqrt(np.pi))
#DxQ terms
c0 = S_pi*lq #lq*Sdq*sqrt(pi)
nu = B*lq*.5
nu2 = nu*nu
#1xM terms
z_lq = z/lq[0, index2]
#NxM terms
tz = t-z
tz_lq = tz/lq[0, index2]
etz_lq2 = -np.exp(-tz_lq*tz_lq)
ez_lq_Bt = np.exp(-z_lq*z_lq -B[index]*t)
# Upsilon calculations
fullind = np.ix_(index, index2)
upsi = np.exp(nu2[fullind] - B[index]*tz + lnDifErf( -tz_lq + nu[fullind], z_lq+nu[fullind] ) )
upsi[t[:, 0] == 0., :] = 0.
#Gradient wrt S
#DxQ term
Sa1 = lq*(.5*np.sqrt(np.pi))
gSdq = Sa1[0,index2]*upsi
#Gradient wrt lq
la1 = S_pi*(1. + 2.*nu2)
Slq = S*lq
uplq = etz_lq2*(tz_lq/lq[0, index2] + B_2[index])
uplq += ez_lq_Bt*(-z_lq/lq[0, index2] + B_2[index])
glq = la1[fullind]*upsi
glq += Slq[fullind]*uplq
#Gradient wrt B
Slq = Slq*lq
nulq = nu*lq
upBd = etz_lq2 + ez_lq_Bt
gB = c0[fullind]*(nulq[fullind] - tz)*upsi + .5*Slq[fullind]*upBd
return glq, gSdq, gB
#TODO: reduce memory usage
def _gkfu_z(self, X, index, Z, index2): #Kfu(t,z)
index = index.reshape(index.size,)
#terms that move along t
d = np.unique(index)
B = self.decay[d].values
S = self.W[d, :].values
#Index transformation
indd = np.arange(self.output_dim)
indd[d] = np.arange(d.size)
index = indd[index]
#t column
t = X[:, 0].reshape(X.shape[0], 1)
B = B.reshape(B.size, 1)
#z row
z = Z[:, 0].reshape(1, Z.shape[0])
index2 = index2.reshape(index2.size,)
lq = self.lengthscale.values.reshape((1, self.rank))
#kfu = np.empty((t.size, z.size))
gz = np.empty((t.size, z.size))
#Dx1 terms
S_pi =S*(.5*np.sqrt(np.pi))
#DxQ terms
#Slq = S*lq
c0 = S_pi*lq #lq*Sdq*sqrt(pi)
nu = (.5*lq)*B
nu2 = nu*nu
#1xM terms
z_lq = z/lq[0, index2]
z_lq2 = -z_lq*z_lq
#NxQ terms
t_lq = t/lq
#NxM terms
zt_lq = z_lq - t_lq[:, index2]
zt_lq2 = -zt_lq*zt_lq
# Upsilon calculations
fullind = np.ix_(index, index2)
z2 = z_lq + nu[fullind]
z1 = z2 - t_lq[:, index2]
upsi = np.exp(nu2[fullind] - B[index]*(t-z) + lnDifErf(z1,z2) )
upsi[t[:, 0] == 0., :] = 0.
#Gradient wrt z
za1 = c0*B
#za2 = S_w
gz = za1[fullind]*upsi + S[fullind]*( np.exp(z_lq2 - B[index]*t) -np.exp(zt_lq2) )
return gz
def lnDifErf(z1,z2):
#Z2 is always positive
logdiferf = np.zeros(z1.shape)
ind = np.where(z1>0.)
ind2 = np.where(z1<=0.)
if ind[0].shape > 0:
z1i = z1[ind]
z12 = z1i*z1i
z2i = z2[ind]
logdiferf[ind] = -z12 + np.log(erfcx(z1i) - erfcx(z2i)*np.exp(z12-z2i**2))
if ind2[0].shape > 0:
z1i = z1[ind2]
z2i = z2[ind2]
logdiferf[ind2] = np.log(erf(z2i) - erf(z1i))
return logdiferf

View file

@ -24,3 +24,5 @@ from .one_vs_all_sparse_classification import OneVsAllSparseClassification
from .dpgplvm import DPBayesianGPLVM from .dpgplvm import DPBayesianGPLVM
from .state_space_model import StateSpace from .state_space_model import StateSpace
from .ibp_lfm import IBPLFM

535
GPy/models/ibp_lfm.py Normal file
View file

@ -0,0 +1,535 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..core.sparse_gp_mpi import SparseGP_MPI
from .. import kern
from ..util.linalg import jitchol, backsub_both_sides, tdot, dtrtrs, dtrtri, pdinv
from ..util import diag
from ..core.parameterization import Param
from ..likelihoods import Gaussian
from ..inference.latent_function_inference.var_dtc_parallel import VarDTC_minibatch
from ..inference.latent_function_inference.posterior import Posterior
from GPy.core.parameterization.variational import VariationalPrior
from ..core.parameterization.parameterized import Parameterized
from paramz.transformations import Logexp, Logistic, __fixed__
log_2_pi = np.log(2*np.pi)
class VarDTC_minibatch_IBPLFM(VarDTC_minibatch):
'''
Modifications of VarDTC_minibatch for IBP LFM
'''
def __init__(self, batchsize=None, limit=3, mpi_comm=None):
super(VarDTC_minibatch_IBPLFM, self).__init__(batchsize, limit, mpi_comm)
def gatherPsiStat(self, kern, X, Z, Y, beta, Zp):
het_noise = beta.size > 1
assert beta.size == 1
trYYT = self.get_trYYT(Y)
if self.Y_speedup and not het_noise:
Y = self.get_YYTfactor(Y)
num_inducing = Z.shape[0]
num_data, output_dim = Y.shape
batchsize = num_data if self.batchsize is None else self.batchsize
psi2_full = np.zeros((num_inducing, num_inducing)) # MxM
psi1Y_full = np.zeros((output_dim, num_inducing)) # DxM
psi0_full = 0.
YRY_full = 0.
for n_start in range(0, num_data, batchsize):
n_end = min(batchsize+n_start, num_data)
if batchsize == num_data:
Y_slice = Y
X_slice = X
else:
Y_slice = Y[n_start:n_end]
X_slice = X[n_start:n_end]
if het_noise:
b = beta[n_start]
YRY_full += np.inner(Y_slice, Y_slice)*b
else:
b = beta
psi0 = kern.Kdiag(X_slice) #Kff^q
psi1 = kern.K(X_slice, Z) #Kfu
indX = X_slice.values
indX = np.int_(np.round(indX[:, -1]))
Zp = Zp.gamma.values
# Extend Zp across columns
indZ = Z.values
indZ = np.int_(np.round(indZ[:, -1])) - Zp.shape[0]
Zpq = Zp[:, indZ]
for d in np.unique(indX):
indd = indX == d
psi1d = psi1[indd, :]
Zpd = Zp[d, :]
Zp2 = Zpd[:, None]*Zpd[None, :] - np.diag(np.power(Zpd, 2)) + np.diag(Zpd)
psi2_full += (np.dot(psi1d.T, psi1d)*Zp2[np.ix_(indZ, indZ)])*b #Zp2*Kufd*Kfud*beta
psi0_full += np.sum(psi0*Zp[indX, :])*b
psi1Y_full += np.dot(Y_slice.T, psi1*Zpq[indX, :])*b
if not het_noise:
YRY_full = trYYT*beta
if self.mpi_comm is not None:
from mpi4py import MPI
psi0_all = np.array(psi0_full)
psi1Y_all = psi1Y_full.copy()
psi2_all = psi2_full.copy()
YRY_all = np.array(YRY_full)
self.mpi_comm.Allreduce([psi0_full, MPI.DOUBLE], [psi0_all, MPI.DOUBLE])
self.mpi_comm.Allreduce([psi1Y_full, MPI.DOUBLE], [psi1Y_all, MPI.DOUBLE])
self.mpi_comm.Allreduce([psi2_full, MPI.DOUBLE], [psi2_all, MPI.DOUBLE])
self.mpi_comm.Allreduce([YRY_full, MPI.DOUBLE], [YRY_all, MPI.DOUBLE])
return psi0_all, psi1Y_all, psi2_all, YRY_all
return psi0_full, psi1Y_full, psi2_full, YRY_full
def inference_likelihood(self, kern, X, Z, likelihood, Y, Zp):
"""
The first phase of inference:
Compute: log-likelihood, dL_dKmm
Cached intermediate results: Kmm, KmmInv,
"""
num_data, output_dim = Y.shape
input_dim = Z.shape[0]
if self.mpi_comm is not None:
from mpi4py import MPI
num_data_all = np.array(num_data,dtype=np.int32)
self.mpi_comm.Allreduce([np.int32(num_data), MPI.INT], [num_data_all, MPI.INT])
num_data = num_data_all
#see whether we've got a different noise variance for each datum
beta = 1./np.fmax(likelihood.variance, 1e-6)
het_noise = beta.size > 1
if het_noise:
self.batchsize = 1
psi0_full, psi1Y_full, psi2_full, YRY_full = self.gatherPsiStat(kern, X, Z, Y, beta, Zp)
#======================================================================
# Compute Common Components
#======================================================================
Kmm = kern.K(Z).copy()
diag.add(Kmm, self.const_jitter)
if not np.isfinite(Kmm).all():
print(Kmm)
Lm = jitchol(Kmm)
LmInv = dtrtri(Lm)
LmInvPsi2LmInvT = np.dot(LmInv, np.dot(psi2_full, LmInv.T))
Lambda = np.eye(Kmm.shape[0])+LmInvPsi2LmInvT
LL = jitchol(Lambda)
LLInv = dtrtri(LL)
logdet_L = 2.*np.sum(np.log(np.diag(LL)))
LmLLInv = np.dot(LLInv, LmInv)
b = np.dot(psi1Y_full, LmLLInv.T)
bbt = np.sum(np.square(b))
v = np.dot(b, LmLLInv).T
LLinvPsi1TYYTPsi1LLinvT = tdot(b.T)
tmp = -np.dot(np.dot(LLInv.T, LLinvPsi1TYYTPsi1LLinvT + output_dim*np.eye(input_dim)), LLInv)
dL_dpsi2R = .5*np.dot(np.dot(LmInv.T, tmp + output_dim*np.eye(input_dim)), LmInv)
# Cache intermediate results
self.midRes['dL_dpsi2R'] = dL_dpsi2R
self.midRes['v'] = v
#======================================================================
# Compute log-likelihood
#======================================================================
if het_noise:
logL_R = -np.sum(np.log(beta))
else:
logL_R = -num_data*np.log(beta)
logL = -(output_dim*(num_data*log_2_pi+logL_R+psi0_full-np.trace(LmInvPsi2LmInvT))+YRY_full-bbt)*.5 - output_dim*logdet_L*.5
#======================================================================
# Compute dL_dKmm
#======================================================================
dL_dKmm = dL_dpsi2R - .5*output_dim*np.dot(np.dot(LmInv.T, LmInvPsi2LmInvT), LmInv)
#======================================================================
# Compute the Posterior distribution of inducing points p(u|Y)
#======================================================================
if not self.Y_speedup or het_noise:
wd_inv = backsub_both_sides(Lm, np.eye(input_dim)- backsub_both_sides(LL, np.identity(input_dim), transpose='left'), transpose='left')
post = Posterior(woodbury_inv=wd_inv, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=Lm)
else:
post = None
#======================================================================
# Compute dL_dthetaL for uncertian input and non-heter noise
#======================================================================
if not het_noise:
dL_dthetaL = .5*(YRY_full*beta + beta*output_dim*psi0_full - num_data*output_dim*beta) - beta*(dL_dpsi2R*psi2_full).sum() - beta*(v.T*psi1Y_full).sum()
self.midRes['dL_dthetaL'] = dL_dthetaL
return logL, dL_dKmm, post
def inference_minibatch(self, kern, X, Z, likelihood, Y, Zp):
"""
The second phase of inference: Computing the derivatives over a minibatch of Y
Compute: dL_dpsi0, dL_dpsi1, dL_dpsi2, dL_dthetaL
return a flag showing whether it reached the end of Y (isEnd)
"""
num_data, output_dim = Y.shape
#see whether we've got a different noise variance for each datum
beta = 1./np.fmax(likelihood.variance, 1e-6)
het_noise = beta.size > 1
# VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
#self.YYTfactor = beta*self.get_YYTfactor(Y)
if self.Y_speedup and not het_noise:
YYT_factor = self.get_YYTfactor(Y)
else:
YYT_factor = Y
n_start = self.batch_pos
batchsize = num_data if self.batchsize is None else self.batchsize
n_end = min(batchsize+n_start, num_data)
if n_end == num_data:
isEnd = True
self.batch_pos = 0
else:
isEnd = False
self.batch_pos = n_end
if batchsize == num_data:
Y_slice = YYT_factor
X_slice = X
else:
Y_slice = YYT_factor[n_start:n_end]
X_slice = X[n_start:n_end]
psi0 = kern.Kdiag(X_slice) #Kffdiag
psi1 = kern.K(X_slice, Z) #Kfu
betapsi1 = np.einsum('n,nm->nm', beta, psi1)
X_slice = X_slice.values
Z = Z.values
Zp = Zp.gamma.values
indX = np.int_(X_slice[:, -1])
indZ = np.int_(Z[:, -1]) - Zp.shape[0]
betaY = beta*Y_slice
#======================================================================
# Load Intermediate Results
#======================================================================
dL_dpsi2R = self.midRes['dL_dpsi2R']
v = self.midRes['v']
#======================================================================
# Compute dL_dpsi
#======================================================================
dL_dpsi0 = -.5*output_dim*(beta * Zp[indX, :]) #XxQ #TODO: Check this gradient
dL_dpsi1 = np.dot(betaY, v.T)
dL_dEZp = psi1*dL_dpsi1
dL_dpsi1 = Zp[np.ix_(indX, indZ)]*dL_dpsi1
dL_dgamma = np.zeros(Zp.shape)
for d in np.unique(indX):
indd = indX == d
betapsi1d = betapsi1[indd, :]
psi1d = psi1[indd, :]
Zpd = Zp[d, :]
Zp2 = Zpd[:, None]*Zpd[None, :] - np.diag(np.power(Zpd, 2)) + np.diag(Zpd)
dL_dpsi1[indd, :] += np.dot(betapsi1d, Zp2[np.ix_(indZ, indZ)] * dL_dpsi2R)*2.
dL_EZp2 = dL_dpsi2R * (np.dot(psi1d.T, psi1d) * beta)*2. # Zpd*Kufd*Kfud*beta
#Gradient of Likelihood wrt gamma is calculated here
EZ = Zp[d, indZ]
for q in range(Zp.shape[1]):
EZt = EZ.copy()
indq = indZ == q
EZt[indq] = .5
dL_dgamma[d, q] = np.sum(dL_dEZp[np.ix_(indd, indq)]) + np.sum(dL_EZp2[:, indq]*EZt[:, None]) -\
.5*beta*(np.sum(psi0[indd, q]))
#======================================================================
# Compute dL_dthetaL
#======================================================================
if isEnd:
dL_dthetaL = self.midRes['dL_dthetaL']
else:
dL_dthetaL = 0.
grad_dict = {'dL_dKdiag': dL_dpsi0,
'dL_dKnm': dL_dpsi1,
'dL_dthetaL': dL_dthetaL,
'dL_dgamma': dL_dgamma}
return isEnd, (n_start, n_end), grad_dict
def update_gradients(model, mpi_comm=None):
if mpi_comm is None:
Y = model.Y
X = model.X
else:
Y = model.Y_local
X = model.X[model.N_range[0]:model.N_range[1]]
model._log_marginal_likelihood, dL_dKmm, model.posterior = model.inference_method.inference_likelihood(model.kern, X, model.Z, model.likelihood, Y, model.Zp)
het_noise = model.likelihood.variance.size > 1
if het_noise:
dL_dthetaL = np.empty((model.Y.shape[0],))
else:
dL_dthetaL = np.float64(0.)
kern_grad = model.kern.gradient.copy()
kern_grad[:] = 0.
model.Z.gradient = 0.
gamma_gradient = model.Zp.gamma.copy()
gamma_gradient[:] = 0.
isEnd = False
while not isEnd:
isEnd, n_range, grad_dict = model.inference_method.inference_minibatch(model.kern, X, model.Z, model.likelihood, Y, model.Zp)
if (n_range[1]-n_range[0]) == X.shape[0]:
X_slice = X
elif mpi_comm is None:
X_slice = model.X[n_range[0]:n_range[1]]
else:
X_slice = model.X[model.N_range[0]+n_range[0]:model.N_range[0]+n_range[1]]
#gradients w.r.t. kernel
model.kern.update_gradients_diag(grad_dict['dL_dKdiag'], X_slice)
kern_grad += model.kern.gradient
model.kern.update_gradients_full(grad_dict['dL_dKnm'], X_slice, model.Z)
kern_grad += model.kern.gradient
#gradients w.r.t. Z
model.Z.gradient += model.kern.gradients_X(grad_dict['dL_dKnm'].T, model.Z, X_slice)
#gradients w.r.t. posterior parameters of Zp
gamma_gradient += grad_dict['dL_dgamma']
if het_noise:
dL_dthetaL[n_range[0]:n_range[1]] = grad_dict['dL_dthetaL']
else:
dL_dthetaL += grad_dict['dL_dthetaL']
# Gather the gradients from multiple MPI nodes
if mpi_comm is not None:
from mpi4py import MPI
if het_noise:
raise "het_noise not implemented!"
kern_grad_all = kern_grad.copy()
Z_grad_all = model.Z.gradient.copy()
gamma_grad_all = gamma_gradient.copy()
mpi_comm.Allreduce([kern_grad, MPI.DOUBLE], [kern_grad_all, MPI.DOUBLE])
mpi_comm.Allreduce([model.Z.gradient, MPI.DOUBLE], [Z_grad_all, MPI.DOUBLE])
mpi_comm.Allreduce([gamma_gradient, MPI.DOUBLE], [gamma_grad_all, MPI.DOUBLE])
kern_grad = kern_grad_all
model.Z.gradient = Z_grad_all
gamma_gradient = gamma_grad_all
#gradients w.r.t. kernel
model.kern.update_gradients_full(dL_dKmm, model.Z, None)
model.kern.gradient += kern_grad
#gradients w.r.t. Z
model.Z.gradient += model.kern.gradients_X(dL_dKmm, model.Z)
#gradient w.r.t. gamma
model.Zp.gamma.gradient = gamma_gradient
# Update Log-likelihood
KL_div = model.variational_prior.KL_divergence(model.Zp)
# update for the KL divergence
model.variational_prior.update_gradients_KL(model.Zp)
model._log_marginal_likelihood += KL_div
# dL_dthetaL
model.likelihood.update_gradients(dL_dthetaL)
class IBPPosterior(Parameterized):
'''
The IBP distribution for variational approximations.
'''
def __init__(self, binary_prob, tau=None, name='Sensitivity space', *a, **kw):
"""
binary_prob : the probability of including a latent function over an output.
"""
super(IBPPosterior, self).__init__(name=name, *a, **kw)
self.gamma = Param("binary_prob", binary_prob, Logistic(1e-10, 1. - 1e-10))
self.link_parameter(self.gamma)
if tau is not None:
assert tau.size == 2*self.gamma_.shape[1]
self.tau = Param("tau", tau, Logexp())
else:
self.tau = Param("tau", np.ones((2, self.gamma.shape[1])), Logexp())
self.link_parameter(self.tau)
def set_gradients(self, grad):
self.gamma.gradient, self.tau.gradient = grad
def __getitem__(self, s):
pass
# if isinstance(s, (int, slice, tuple, list, np.ndarray)):
# import copy
# n = self.__new__(self.__class__, self.name)
# dc = self.__dict__.copy()
# dc['binary_prob'] = self.binary_prob[s]
# dc['tau'] = self.tau
# dc['parameters'] = copy.copy(self.parameters)
# n.__dict__.update(dc)
# n.parameters[dc['binary_prob']._parent_index_] = dc['binary_prob']
# n.parameters[dc['tau']._parent_index_] = dc['tau']
# n._gradient_array_ = None
# oversize = self.size - self.gamma.size - self.tau.size
# n.size = n.gamma.size + n.tau.size + oversize
# return n
# else:
# return super(IBPPosterior, self).__getitem__(s)
class IBPPrior(VariationalPrior):
def __init__(self, rank, alpha=2., name='IBPPrior', **kw):
super(IBPPrior, self).__init__(name=name, **kw)
from paramz.transformations import __fixed__
self.rank = rank
self.alpha = Param('alpha', alpha, __fixed__)
self.link_parameter(self.alpha)
def KL_divergence(self, variational_posterior):
from scipy.special import gamma, psi
eta, tau = variational_posterior.gamma.values, variational_posterior.tau.values
sum_eta = np.sum(eta, axis=0) #sum_d gamma(d,q)
D_seta = eta.shape[0] - sum_eta
ad = self.alpha/eta.shape[1]
psitau1 = psi(tau[0, :])
psitau2 = psi(tau[1, :])
sumtau = np.sum(tau, axis=0)
psitau = psi(sumtau)
# E[log p(z)]
part1 = np.sum(sum_eta*psitau1 + D_seta*psitau2 - eta.shape[0]*psitau)
# E[log p(pi)]
part1 += (ad - 1.)*np.sum(psitau1 - psitau) + eta.shape[1]*np.log(ad)
#H(z)
part2 = np.sum(-(1.-eta)*np.log(1.-eta) - eta*np.log(eta))
#H(pi)
part2 += np.sum(np.log(gamma(tau[0, :])*gamma(tau[1, :])/gamma(sumtau))-(tau[0, :]-1.)*psitau1-(tau[1, :]-1.)*psitau2\
+ (sumtau-2.)*psitau)
return part1+part2
def update_gradients_KL(self, variational_posterior):
eta, tau = variational_posterior.gamma.values, variational_posterior.tau.values
from scipy.special import psi, polygamma
dgamma = np.log(1. - eta) - np.log(eta) + psi(tau[0, :]) - psi(tau[1, :])
variational_posterior.gamma.gradient += dgamma
ad = self.alpha/self.rank
sumeta = np.sum(eta, axis=0)
sumtau = np.sum(tau, axis=0)
common = (-eta.shape[0] - (ad - 1.) + (sumtau - 2.))*polygamma(1, sumtau)
variational_posterior.tau.gradient[0, :] = (sumeta + ad - tau[0, :])*polygamma(1, tau[0, :]) + common
variational_posterior.tau.gradient[1, :] = ((eta.shape[0] - sumeta) - (tau[1, :] - 1.))*polygamma(1, tau[1, :])\
+ common
class IBPLFM(SparseGP_MPI):
"""
Indian Buffet Process for Latent Force Models
:param Y: observed data (np.ndarray) or GPy.likelihood
:type Y: np.ndarray| GPy.likelihood instance
:param X: input data (np.ndarray) [X:values, X:index], index refers to the number of the output
:type X: np.ndarray
:param input_dim: latent dimensionality
:type input_dim: int
: param rank: number of latent functions
"""
def __init__(self, X, Y, input_dim=2, output_dim=1, rank=1, Gamma=None, num_inducing=10,
Z=None, kernel=None, inference_method=None, likelihood=None, name='IBP for LFM', alpha=2., beta=2., connM=None, tau=None, mpi_comm=None, normalizer=False, variational_prior=None,**kwargs):
if kernel is None:
kernel = kern.EQ_ODE2(input_dim, output_dim, rank)
if Gamma is None:
gamma = np.empty((output_dim, rank)) # The posterior probabilities of the binary variable in the variational approximation
gamma[:] = 0.5 + 0.1 * np.random.randn(output_dim, rank)
gamma[gamma>1.-1e-9] = 1.-1e-9
gamma[gamma<1e-9] = 1e-9
else:
gamma = Gamma.copy()
#TODO: create a vector of inducing points
if Z is None:
Z = np.random.permutation(X.copy())[:num_inducing]
assert Z.shape[1] == X.shape[1]
if likelihood is None:
likelihood = Gaussian()
if inference_method is None:
inference_method = VarDTC_minibatch_IBPLFM(mpi_comm=mpi_comm)
#Definition of variational terms
self.variational_prior = IBPPrior(rank=rank, alpha=alpha) if variational_prior is None else variational_prior
self.Zp = IBPPosterior(gamma, tau=tau)
super(IBPLFM, self).__init__(X, Y, Z, kernel, likelihood, variational_prior=self.variational_prior, inference_method=inference_method, name=name, mpi_comm=mpi_comm, normalizer=normalizer, **kwargs)
self.link_parameter(self.Zp, index=0)
def set_Zp_gradients(self, Zp, Zp_grad):
"""Set the gradients of the posterior distribution of Zp in its specific form."""
Zp.gamma.gradient = Zp_grad
def get_Zp_gradients(self, Zp):
"""Get the gradients of the posterior distribution of Zp in its specific form."""
return Zp.gamma.gradient
def _propogate_Zp_val(self):
pass
def parameters_changed(self):
#super(IBPLFM,self).parameters_changed()
if isinstance(self.inference_method, VarDTC_minibatch_IBPLFM):
update_gradients(self, mpi_comm=self.mpi_comm)
return
# Add the KL divergence term
self._log_marginal_likelihood += self.variational_prior.KL_divergence(self.Zp)
#TODO Change the following according to this variational distribution
#self.Zp.gamma.gradient = self.
# update for the KL divergence
self.variational_prior.update_gradients_KL(self.Zp)