mirror of
https://github.com/SheffieldML/GPy.git
synced 2026-06-11 15:15:15 +02:00
[missing data] general implementation for subsetting data
This commit is contained in:
parent
d93fac8c13
commit
fa7807ee6f
7 changed files with 329 additions and 108 deletions
|
|
@ -9,6 +9,7 @@ from .. import likelihoods
|
|||
from parameterization.variational import VariationalPosterior
|
||||
|
||||
import logging
|
||||
from GPy.inference.latent_function_inference.posterior import Posterior
|
||||
logger = logging.getLogger("sparse gp")
|
||||
|
||||
class SparseGP(GP):
|
||||
|
|
@ -34,12 +35,18 @@ class SparseGP(GP):
|
|||
|
||||
"""
|
||||
|
||||
def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None, name='sparse gp', Y_metadata=None, normalizer=False):
|
||||
def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
|
||||
name='sparse gp', Y_metadata=None, normalizer=False,
|
||||
missing_data=False):
|
||||
|
||||
self.missing_data = missing_data
|
||||
if self.missing_data:
|
||||
self.ninan = ~np.isnan(Y)
|
||||
|
||||
#pick a sensible inference method
|
||||
if inference_method is None:
|
||||
if isinstance(likelihood, likelihoods.Gaussian):
|
||||
inference_method = var_dtc.VarDTC()
|
||||
inference_method = var_dtc.VarDTC(limit=1 if not self.missing_data else Y.shape[1])
|
||||
else:
|
||||
#inference_method = ??
|
||||
raise NotImplementedError, "what to do what to do?"
|
||||
|
|
@ -51,44 +58,200 @@ class SparseGP(GP):
|
|||
GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
|
||||
logger.info("Adding Z as parameter")
|
||||
self.link_parameter(self.Z, index=0)
|
||||
if self.missing_data:
|
||||
self.Ylist = []
|
||||
overall = self.Y_normalized.shape[1]
|
||||
m_f = lambda i: "Precomputing Y for missing data: {: >7.2%}".format(float(i+1)/overall)
|
||||
message = m_f(-1)
|
||||
print message,
|
||||
for d in xrange(overall):
|
||||
self.Ylist.append(self.Y_normalized[self.ninan[:, d], d][:, None])
|
||||
print ' '*(len(message)+1) + '\r',
|
||||
message = m_f(d)
|
||||
print message,
|
||||
print ''
|
||||
|
||||
|
||||
def has_uncertain_inputs(self):
|
||||
return isinstance(self.X, VariationalPosterior)
|
||||
|
||||
def parameters_changed(self):
|
||||
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata)
|
||||
self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
|
||||
if isinstance(self.X, VariationalPosterior):
|
||||
def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None):
|
||||
"""
|
||||
This is the standard part, which usually belongs in parameters_changed.
|
||||
|
||||
For automatic handling of subsampling (such as missing_data, stochastics etc.), we need to put this into an inner
|
||||
loop, in order to ensure a different handling of gradients etc of different
|
||||
subsets of data.
|
||||
|
||||
The dict in current_values will be passed aroung as current_values for
|
||||
the rest of the algorithm, so this is the place to store current values,
|
||||
such as subsets etc, if necessary.
|
||||
|
||||
If Lm and dL_dKmm can be precomputed (or only need to be computed once)
|
||||
pass them in here, so they will be passed to the inference_method.
|
||||
|
||||
subset_indices is a dictionary of indices. you can put the indices however you
|
||||
like them into this dictionary for inner use of the indices inside the
|
||||
algorithm.
|
||||
"""
|
||||
posterior, log_marginal_likelihood, grad_dict = self.inference_method.inference(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=None)
|
||||
current_values = {}
|
||||
likelihood.update_gradients(grad_dict['dL_dthetaL'])
|
||||
current_values['likgrad'] = likelihood.gradient.copy()
|
||||
if subset_indices is None:
|
||||
subset_indices = {}
|
||||
if isinstance(X, VariationalPosterior):
|
||||
#gradients wrt kernel
|
||||
dL_dKmm = self.grad_dict['dL_dKmm']
|
||||
self.kern.update_gradients_full(dL_dKmm, self.Z, None)
|
||||
target = self.kern.gradient.copy()
|
||||
self.kern.update_gradients_expectations(variational_posterior=self.X,
|
||||
Z=self.Z,
|
||||
dL_dpsi0=self.grad_dict['dL_dpsi0'],
|
||||
dL_dpsi1=self.grad_dict['dL_dpsi1'],
|
||||
dL_dpsi2=self.grad_dict['dL_dpsi2'])
|
||||
self.kern.gradient += target
|
||||
dL_dKmm = grad_dict['dL_dKmm']
|
||||
kern.update_gradients_full(dL_dKmm, Z, None)
|
||||
current_values['kerngrad'] = kern.gradient.copy()
|
||||
kern.update_gradients_expectations(variational_posterior=X,
|
||||
Z=Z,
|
||||
dL_dpsi0=grad_dict['dL_dpsi0'],
|
||||
dL_dpsi1=grad_dict['dL_dpsi1'],
|
||||
dL_dpsi2=grad_dict['dL_dpsi2'])
|
||||
current_values['kerngrad'] += kern.gradient
|
||||
|
||||
#gradients wrt Z
|
||||
self.Z.gradient = self.kern.gradients_X(dL_dKmm, self.Z)
|
||||
self.Z.gradient += self.kern.gradients_Z_expectations(
|
||||
self.grad_dict['dL_dpsi0'],
|
||||
self.grad_dict['dL_dpsi1'],
|
||||
self.grad_dict['dL_dpsi2'],
|
||||
Z=self.Z,
|
||||
variational_posterior=self.X)
|
||||
current_values['Zgrad'] = kern.gradients_X(dL_dKmm, Z)
|
||||
current_values['Zgrad'] += kern.gradients_Z_expectations(
|
||||
grad_dict['dL_dpsi0'],
|
||||
grad_dict['dL_dpsi1'],
|
||||
grad_dict['dL_dpsi2'],
|
||||
Z=Z,
|
||||
variational_posterior=X)
|
||||
else:
|
||||
#gradients wrt kernel
|
||||
self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X)
|
||||
target = self.kern.gradient.copy()
|
||||
self.kern.update_gradients_full(self.grad_dict['dL_dKnm'], self.X, self.Z)
|
||||
target += self.kern.gradient
|
||||
self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z, None)
|
||||
self.kern.gradient += target
|
||||
kern.update_gradients_diag(grad_dict['dL_dKdiag'], X)
|
||||
current_values['kerngrad'] = kern.gradient.copy()
|
||||
kern.update_gradients_full(grad_dict['dL_dKnm'], X, Z)
|
||||
current_values['kerngrad'] += kern.gradient
|
||||
kern.update_gradients_full(grad_dict['dL_dKmm'], Z, None)
|
||||
current_values['kerngrad'] += kern.gradient
|
||||
#gradients wrt Z
|
||||
self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z)
|
||||
self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X)
|
||||
current_values['Zgrad'] = kern.gradients_X(grad_dict['dL_dKmm'], Z)
|
||||
current_values['Zgrad'] += kern.gradients_X(grad_dict['dL_dKnm'].T, Z, X)
|
||||
return posterior, log_marginal_likelihood, grad_dict, current_values, subset_indices
|
||||
|
||||
def _inner_take_over_or_update(self, full_values=None, current_values=None, value_indices=None):
|
||||
"""
|
||||
This is for automatic updates of values in the inner loop of missing
|
||||
data handling. Both arguments are dictionaries and the values in
|
||||
full_values will be updated by the current_gradients.
|
||||
|
||||
If a key from current_values does not exist in full_values, it will be
|
||||
initialized to the value in current_values.
|
||||
|
||||
If there is indices needed for the update, value_indices can be used for
|
||||
that. If value_indices has the same key, as current_values, the update
|
||||
in full_values will be indexed by the indices in value_indices.
|
||||
|
||||
grads:
|
||||
dictionary of standing gradients (you will have to carefully make sure, that
|
||||
the ordering is right!). The values in here will be updated such that
|
||||
full_values[key] += current_values[key] forall key in full_gradients.keys()
|
||||
|
||||
gradients:
|
||||
dictionary of gradients in the current set of parameters.
|
||||
|
||||
value_indices:
|
||||
dictionary holding indices for the update in full_values.
|
||||
if the key exists the update rule is:
|
||||
full_values[key][value_indices[key]] += current_values[key]
|
||||
"""
|
||||
for key in current_values.keys():
|
||||
if value_indices is not None and value_indices.has_key(key):
|
||||
index = value_indices[key]
|
||||
else:
|
||||
index = slice(None)
|
||||
if full_values.has_key(key):
|
||||
full_values[key][index] += current_values[key]
|
||||
else:
|
||||
full_values[key] = current_values[key]
|
||||
|
||||
def _inner_values_update(self, current_values):
|
||||
"""
|
||||
This exists if there is more to do with the current values.
|
||||
It will be called allways in the inner loop, so that
|
||||
you can do additional inner updates for the inside of the missing data
|
||||
loop etc. This can also be used for stochastic updates, when only working on
|
||||
one dimension of the output.
|
||||
"""
|
||||
pass
|
||||
|
||||
def _outer_values_update(self, full_values):
|
||||
"""
|
||||
Here you put the values, which were collected before in the right places.
|
||||
E.g. set the gradients of parameters, etc.
|
||||
"""
|
||||
self.likelihood.gradient = full_values['likgrad']
|
||||
self.kern.gradient = full_values['kerngrad']
|
||||
self.Z.gradient = full_values['Zgrad']
|
||||
|
||||
def _outer_init_full_values(self):
|
||||
"""
|
||||
If full_values has indices in values_indices, we might want to initialize
|
||||
the full_values differently, so that subsetting is possible.
|
||||
|
||||
Here you can initialize the full_values for the values needed.
|
||||
|
||||
Keep in mind, that if a key does not exist in full_values when updating
|
||||
values, it will be set (so e.g. for Z there is no need to initialize Zgrad,
|
||||
as there is no subsetting needed. For X in BGPLVM on the other hand we probably need
|
||||
to initialize the gradients for the mean and the variance in order to
|
||||
have the full gradient for indexing)
|
||||
"""
|
||||
return {}
|
||||
|
||||
def _outer_loop_for_missing_data(self):
|
||||
Lm = None
|
||||
dL_dKmm = None
|
||||
Kmm = None
|
||||
|
||||
self._log_marginal_likelihood = 0
|
||||
full_values = self._outer_init_full_values()
|
||||
|
||||
woodbury_inv = np.zeros((self.num_inducing, self.num_inducing, self.output_dim))
|
||||
woodbury_vector = np.zeros((self.num_inducing, self.output_dim))
|
||||
m_f = lambda i: "Inference with missing data: {: >7.2%}".format(float(i+1)/self.output_dim)
|
||||
message = m_f(-1)
|
||||
print message,
|
||||
for d in xrange(self.output_dim):
|
||||
ninan = self.ninan[:, d]
|
||||
print ' '*(len(message)) + '\r',
|
||||
message = m_f(d)
|
||||
print message,
|
||||
|
||||
posterior, log_marginal_likelihood, \
|
||||
grad_dict, current_values, value_indices = self._inner_parameters_changed(
|
||||
self.kern, self.X[ninan],
|
||||
self.Z, self.likelihood,
|
||||
self.Ylist[d], self.Y_metadata,
|
||||
Lm, dL_dKmm,
|
||||
subset_indices=dict(outputs=d, samples=ninan))
|
||||
|
||||
self._inner_take_over_or_update(full_values, current_values, value_indices)
|
||||
self._inner_values_update(current_values)
|
||||
|
||||
Lm = posterior.K_chol
|
||||
dL_dKmm = grad_dict['dL_dKmm']
|
||||
Kmm = posterior._K
|
||||
woodbury_inv[:, :, d] = posterior.woodbury_inv
|
||||
woodbury_vector[:, d:d+1] = posterior.woodbury_vector
|
||||
self._log_marginal_likelihood += log_marginal_likelihood
|
||||
print ''
|
||||
|
||||
self.posterior = Posterior(woodbury_inv=woodbury_inv, woodbury_vector=woodbury_vector,
|
||||
K=Kmm, mean=None, cov=None, K_chol=Lm)
|
||||
self._outer_values_update(full_values)
|
||||
|
||||
def parameters_changed(self):
|
||||
if self.missing_data:
|
||||
self._outer_loop_for_missing_data()
|
||||
else:
|
||||
self.posterior, self._log_marginal_likelihood, self.grad_dict, gradients, _ = self._inner_parameters_changed(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata)
|
||||
self.kern.gradient = gradients['kerngrad']
|
||||
self.Z.gradient = gradients['Zgrad']
|
||||
|
||||
def _raw_predict(self, Xnew, full_cov=False, kern=None):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -34,15 +34,15 @@ class SparseGP_MPI(SparseGP):
|
|||
|
||||
"""
|
||||
|
||||
def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp mpi', Y_metadata=None, mpi_comm=None, normalizer=False):
|
||||
def __init__(self, X, Y, Z, kernel, likelihood, variational_prior=None, inference_method=None, name='sparse gp mpi', Y_metadata=None, mpi_comm=None, normalizer=False, missing_data=False):
|
||||
self._IN_OPTIMIZATION_ = False
|
||||
if mpi_comm != None:
|
||||
if inference_method is None:
|
||||
inference_method = VarDTC_minibatch(mpi_comm=mpi_comm)
|
||||
else:
|
||||
assert isinstance(inference_method, VarDTC_minibatch), 'inference_method has to support MPI!'
|
||||
|
||||
super(SparseGP_MPI, self).__init__(X, Y, Z, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
|
||||
|
||||
super(SparseGP_MPI, self).__init__(X, Y, Z, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer, missing_data=missing_data)
|
||||
self.update_model(False)
|
||||
self.link_parameter(self.X, index=0)
|
||||
if variational_prior is not None:
|
||||
|
|
@ -75,18 +75,18 @@ class SparseGP_MPI(SparseGP):
|
|||
return dc
|
||||
|
||||
#=====================================================
|
||||
# The MPI parallelization
|
||||
# The MPI parallelization
|
||||
# - can move to model at some point
|
||||
#=====================================================
|
||||
|
||||
|
||||
@SparseGP.optimizer_array.setter
|
||||
def optimizer_array(self, p):
|
||||
if self.mpi_comm != None:
|
||||
if self._IN_OPTIMIZATION_ and self.mpi_comm.rank==0:
|
||||
self.mpi_comm.Bcast(np.int32(1),root=0)
|
||||
self.mpi_comm.Bcast(p, root=0)
|
||||
self.mpi_comm.Bcast(p, root=0)
|
||||
SparseGP.optimizer_array.fset(self,p)
|
||||
|
||||
|
||||
def optimize(self, optimizer=None, start=None, **kwargs):
|
||||
self._IN_OPTIMIZATION_ = True
|
||||
if self.mpi_comm==None:
|
||||
|
|
|
|||
|
|
@ -367,14 +367,14 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1,
|
|||
Y = Ylist[0]
|
||||
k = kern.Linear(Q, ARD=True)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
|
||||
|
||||
inan = _np.random.binomial(1, .8, size=Y.shape).astype(bool) # 80% missing data
|
||||
inan = _np.random.binomial(1, .2, size=Y.shape).astype(bool) # 80% missing data
|
||||
Ymissing = Y.copy()
|
||||
Ymissing[inan] = _np.nan
|
||||
|
||||
m = BayesianGPLVM(Ymissing, Q, init="random", num_inducing=num_inducing,
|
||||
inference_method=VarDTCMissingData(inan=inan), kernel=k)
|
||||
kernel=k, missing_data=True)
|
||||
|
||||
m.X.variance[:] = _np.random.uniform(0,.01,m.X.shape)
|
||||
m.X.variance[:] = _np.random.uniform(0,.1,m.X.shape)
|
||||
m.likelihood.variance = .01
|
||||
m.parameters_changed()
|
||||
m.Yreal = Y
|
||||
|
|
|
|||
|
|
@ -62,7 +62,9 @@ class VarDTC(LatentFunctionInference):
|
|||
def get_VVTfactor(self, Y, prec):
|
||||
return Y * prec # TODO chache this, and make it effective
|
||||
|
||||
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
|
||||
|
||||
|
||||
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None, dL_dKmm=None):
|
||||
|
||||
_, output_dim = Y.shape
|
||||
uncertain_inputs = isinstance(X, VariationalPosterior)
|
||||
|
|
@ -84,8 +86,8 @@ class VarDTC(LatentFunctionInference):
|
|||
|
||||
Kmm = kern.K(Z).copy()
|
||||
diag.add(Kmm, self.const_jitter)
|
||||
Lm = jitchol(Kmm)
|
||||
|
||||
if Lm is None:
|
||||
Lm = jitchol(Kmm)
|
||||
|
||||
# The rather complex computations of A, and the psi stats
|
||||
if uncertain_inputs:
|
||||
|
|
@ -100,7 +102,6 @@ class VarDTC(LatentFunctionInference):
|
|||
else:
|
||||
psi0 = kern.Kdiag(X)
|
||||
psi1 = kern.K(X, Z)
|
||||
psi2 = None
|
||||
if het_noise:
|
||||
tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1)))
|
||||
else:
|
||||
|
|
@ -122,11 +123,12 @@ class VarDTC(LatentFunctionInference):
|
|||
delit = tdot(_LBi_Lmi_psi1Vf)
|
||||
data_fit = np.trace(delit)
|
||||
DBi_plus_BiPBi = backsub_both_sides(LB, output_dim * np.eye(num_inducing) + delit)
|
||||
delit = -0.5 * DBi_plus_BiPBi
|
||||
delit += -0.5 * B * output_dim
|
||||
delit += output_dim * np.eye(num_inducing)
|
||||
# Compute dL_dKmm
|
||||
dL_dKmm = backsub_both_sides(Lm, delit)
|
||||
if dL_dKmm is None:
|
||||
delit = -0.5 * DBi_plus_BiPBi
|
||||
delit += -0.5 * B * output_dim
|
||||
delit += output_dim * np.eye(num_inducing)
|
||||
# Compute dL_dKmm
|
||||
dL_dKmm = backsub_both_sides(Lm, delit)
|
||||
|
||||
# derivatives of L w.r.t. psi
|
||||
dL_dpsi0, dL_dpsi1, dL_dpsi2 = _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm,
|
||||
|
|
@ -208,6 +210,7 @@ class VarDTCMissingData(LatentFunctionInference):
|
|||
inan = np.isnan(Y)
|
||||
has_none = inan.any()
|
||||
self._inan = ~inan
|
||||
inan = self._inan
|
||||
else:
|
||||
inan = self._inan
|
||||
has_none = True
|
||||
|
|
@ -241,7 +244,7 @@ class VarDTCMissingData(LatentFunctionInference):
|
|||
self._subarray_indices = [[slice(None),slice(None)]]
|
||||
return [Y], [(Y**2).sum()]
|
||||
|
||||
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
|
||||
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None, Lm=None):
|
||||
if isinstance(X, VariationalPosterior):
|
||||
uncertain_inputs = True
|
||||
psi0_all = kern.psi0(Z, X)
|
||||
|
|
@ -260,7 +263,7 @@ class VarDTCMissingData(LatentFunctionInference):
|
|||
dL_dpsi0_all = np.zeros(Y.shape[0])
|
||||
dL_dpsi1_all = np.zeros((Y.shape[0], num_inducing))
|
||||
if uncertain_inputs:
|
||||
dL_dpsi2_all = np.zeros((num_inducing, num_inducing))
|
||||
dL_dpsi2_all = np.zeros((Y.shape[0], num_inducing, num_inducing))
|
||||
|
||||
dL_dR = 0
|
||||
woodbury_vector = np.zeros((num_inducing, Y.shape[1]))
|
||||
|
|
@ -271,31 +274,31 @@ class VarDTCMissingData(LatentFunctionInference):
|
|||
Kmm = kern.K(Z).copy()
|
||||
diag.add(Kmm, self.const_jitter)
|
||||
#factor Kmm
|
||||
Lm = jitchol(Kmm)
|
||||
if Lm is None:
|
||||
Lm = jitchol(Kmm)
|
||||
if uncertain_inputs: LmInv = dtrtri(Lm)
|
||||
|
||||
size = Y.shape[1]
|
||||
size = len(Ys)
|
||||
next_ten = 0
|
||||
|
||||
for i, [y, v, trYYT] in enumerate(itertools.izip(Ys, self._inan.T, traces)):
|
||||
if ((i+1.)/size) >= next_ten:
|
||||
logger.info('inference {:> 6.1%}'.format((i+1.)/size))
|
||||
next_ten += .1
|
||||
if het_noise: beta = beta_all[i]
|
||||
else: beta = beta_all
|
||||
|
||||
VVT_factor = (y*beta)
|
||||
output_dim = 1#len(ind)
|
||||
|
||||
psi0 = psi0_all[v]
|
||||
psi1 = psi1_all[v, :]
|
||||
if uncertain_inputs: psi2 = kern.psi2(Z, X[v, :])
|
||||
if uncertain_inputs:
|
||||
psi2 = kern.psi2(Z, X[v, :])
|
||||
else: psi2 = None
|
||||
num_data = psi1.shape[0]
|
||||
|
||||
if uncertain_inputs:
|
||||
if het_noise: psi2_beta = psi2 * (beta.flatten().reshape(num_data, 1, 1)).sum(0)
|
||||
else: psi2_beta = psi2 * beta
|
||||
else: psi2_beta = psi2 * (beta)
|
||||
A = LmInv.dot(psi2_beta.dot(LmInv.T))
|
||||
else:
|
||||
if het_noise: tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1)))
|
||||
|
|
@ -330,11 +333,11 @@ class VarDTCMissingData(LatentFunctionInference):
|
|||
dL_dpsi0_all[v] += dL_dpsi0
|
||||
dL_dpsi1_all[v, :] += dL_dpsi1
|
||||
if uncertain_inputs:
|
||||
dL_dpsi2_all += dL_dpsi2
|
||||
dL_dpsi2_all[v, :, :] += dL_dpsi2
|
||||
|
||||
# log marginal likelihood
|
||||
log_marginal += _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise,
|
||||
psi0, A, LB, trYYT, data_fit,VVT_factor)
|
||||
psi0, A, LB, trYYT, data_fit, VVT_factor)
|
||||
|
||||
#put the gradients in the right places
|
||||
dL_dR += _compute_dL_dR(likelihood,
|
||||
|
|
@ -393,7 +396,6 @@ def _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, VVT_factor, C
|
|||
# subsume back into psi1 (==Kmn)
|
||||
dL_dpsi1 += 2.*np.dot(psi1, dL_dpsi2)
|
||||
dL_dpsi2 = None
|
||||
|
||||
return dL_dpsi0, dL_dpsi1, dL_dpsi2
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ def psicomputations(variance, lengthscale, Z, variational_posterior):
|
|||
# _psi1 NxM
|
||||
mu = variational_posterior.mean
|
||||
S = variational_posterior.variance
|
||||
|
||||
|
||||
psi0 = np.empty(mu.shape[0])
|
||||
psi0[:] = variance
|
||||
psi1 = _psi1computations(variance, lengthscale, Z, mu, S)
|
||||
|
|
@ -41,7 +41,7 @@ def __psi1computations(variance, lengthscale, Z, mu, S):
|
|||
_psi1_logdenom = np.log(S/lengthscale2+1.).sum(axis=-1) # N
|
||||
_psi1_log = (_psi1_logdenom[:,None]+np.einsum('nmq,nq->nm',np.square(mu[:,None,:]-Z[None,:,:]),1./(S+lengthscale2)))/(-2.)
|
||||
_psi1 = variance*np.exp(_psi1_log)
|
||||
|
||||
|
||||
return _psi1
|
||||
|
||||
def __psi2computations(variance, lengthscale, Z, mu, S):
|
||||
|
|
@ -54,27 +54,27 @@ def __psi2computations(variance, lengthscale, Z, mu, S):
|
|||
# here are the "statistics" for psi2
|
||||
# Produced intermediate results:
|
||||
# _psi2 MxM
|
||||
|
||||
|
||||
lengthscale2 = np.square(lengthscale)
|
||||
|
||||
|
||||
_psi2_logdenom = np.log(2.*S/lengthscale2+1.).sum(axis=-1)/(-2.) # N
|
||||
_psi2_exp1 = (np.square(Z[:,None,:]-Z[None,:,:])/lengthscale2).sum(axis=-1)/(-4.) #MxM
|
||||
Z_hat = (Z[:,None,:]+Z[None,:,:])/2. #MxMxQ
|
||||
denom = 1./(2.*S+lengthscale2)
|
||||
_psi2_exp2 = -(np.square(mu)*denom).sum(axis=-1)[:,None,None]+2.*np.einsum('nq,moq,nq->nmo',mu,Z_hat,denom)-np.einsum('moq,nq->nmo',np.square(Z_hat),denom)
|
||||
_psi2 = variance*variance*np.exp(_psi2_logdenom[:,None,None]+_psi2_exp1[None,:,:]+_psi2_exp2)
|
||||
|
||||
|
||||
|
||||
return _psi2
|
||||
|
||||
def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
|
||||
ARD = (len(lengthscale)!=1)
|
||||
|
||||
|
||||
dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance)
|
||||
dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance)
|
||||
|
||||
dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2
|
||||
|
||||
|
||||
dL_dlengscale = dl_psi1 + dl_psi2
|
||||
if not ARD:
|
||||
dL_dlengscale = dL_dlengscale.sum()
|
||||
|
|
@ -82,7 +82,7 @@ def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscal
|
|||
dL_dmu = dmu_psi1 + dmu_psi2
|
||||
dL_dS = dS_psi1 + dS_psi2
|
||||
dL_dZ = dZ_psi1 + dZ_psi2
|
||||
|
||||
|
||||
return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS
|
||||
|
||||
def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S):
|
||||
|
|
@ -101,9 +101,9 @@ def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S):
|
|||
# _dL_dgamma NxQ
|
||||
# _dL_dmu NxQ
|
||||
# _dL_dS NxQ
|
||||
|
||||
|
||||
lengthscale2 = np.square(lengthscale)
|
||||
|
||||
|
||||
_psi1 = _psi1computations(variance, lengthscale, Z, mu, S)
|
||||
Lpsi1 = dL_dpsi1*_psi1
|
||||
Zmu = Z[None,:,:]-mu[:,None,:] # NxMxQ
|
||||
|
|
@ -114,7 +114,7 @@ def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S):
|
|||
_dL_dS = np.einsum('nm,nmq,nq->nq',Lpsi1,(Zmu2_denom-1.),denom)/2.
|
||||
_dL_dZ = -np.einsum('nm,nmq,nq->mq',Lpsi1,Zmu,denom)
|
||||
_dL_dl = np.einsum('nm,nmq,nq->q',Lpsi1,(Zmu2_denom+(S/lengthscale2)[:,None,:]),denom*lengthscale)
|
||||
|
||||
|
||||
return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS
|
||||
|
||||
def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
|
||||
|
|
@ -133,13 +133,12 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
|
|||
# _dL_dgamma NxQ
|
||||
# _dL_dmu NxQ
|
||||
# _dL_dS NxQ
|
||||
|
||||
|
||||
lengthscale2 = np.square(lengthscale)
|
||||
denom = 1./(2*S+lengthscale2)
|
||||
denom2 = np.square(denom)
|
||||
|
||||
_psi2 = _psi2computations(variance, lengthscale, Z, mu, S) # NxMxM
|
||||
|
||||
Lpsi2 = dL_dpsi2[None,:,:]*_psi2
|
||||
Lpsi2sum = np.einsum('nmo->n',Lpsi2) #N
|
||||
Lpsi2Z = np.einsum('nmo,oq->nq',Lpsi2,Z) #NxQ
|
||||
|
|
@ -147,7 +146,7 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
|
|||
Lpsi2Z2p = np.einsum('nmo,mq,oq->nq',Lpsi2,Z,Z) #NxQ
|
||||
Lpsi2Zhat = Lpsi2Z
|
||||
Lpsi2Zhat2 = (Lpsi2Z2+Lpsi2Z2p)/2
|
||||
|
||||
|
||||
_dL_dvar = Lpsi2sum.sum()*2/variance
|
||||
_dL_dmu = (-2*denom) * (mu*Lpsi2sum[:,None]-Lpsi2Zhat)
|
||||
_dL_dS = (2*np.square(denom))*(np.square(mu)*Lpsi2sum[:,None]-2*mu*Lpsi2Zhat+Lpsi2Zhat2) - denom*Lpsi2sum[:,None]
|
||||
|
|
@ -157,6 +156,6 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
|
|||
(2*mu*denom2)*Lpsi2Zhat+denom2*Lpsi2Zhat2).sum(axis=0)
|
||||
|
||||
return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS
|
||||
|
||||
|
||||
_psi1computations = Cacher(__psi1computations, limit=1)
|
||||
_psi2computations = Cacher(__psi2computations, limit=1)
|
||||
|
|
|
|||
|
|
@ -25,7 +25,9 @@ class BayesianGPLVM(SparseGP_MPI):
|
|||
|
||||
"""
|
||||
def __init__(self, Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
|
||||
Z=None, kernel=None, inference_method=None, likelihood=None, name='bayesian gplvm', mpi_comm=None, normalizer=None):
|
||||
Z=None, kernel=None, inference_method=None, likelihood=None,
|
||||
name='bayesian gplvm', mpi_comm=None, normalizer=None,
|
||||
missing_data=False):
|
||||
self.mpi_comm = mpi_comm
|
||||
self.__IN_OPTIMIZATION__ = False
|
||||
|
||||
|
|
@ -59,24 +61,23 @@ class BayesianGPLVM(SparseGP_MPI):
|
|||
X = NormalPosterior(X, X_variance)
|
||||
|
||||
if inference_method is None:
|
||||
inan = np.isnan(Y)
|
||||
if np.any(inan):
|
||||
from ..inference.latent_function_inference.var_dtc import VarDTCMissingData
|
||||
self.logger.debug("creating inference_method with var_dtc missing data")
|
||||
inference_method = VarDTCMissingData(inan=inan)
|
||||
elif mpi_comm is not None:
|
||||
if mpi_comm is not None:
|
||||
inference_method = VarDTC_minibatch(mpi_comm=mpi_comm)
|
||||
else:
|
||||
from ..inference.latent_function_inference.var_dtc import VarDTC
|
||||
self.logger.debug("creating inference_method var_dtc")
|
||||
inference_method = VarDTC()
|
||||
inference_method = VarDTC(limit=1 if not missing_data else Y.shape[1])
|
||||
if isinstance(inference_method,VarDTC_minibatch):
|
||||
inference_method.mpi_comm = mpi_comm
|
||||
|
||||
if kernel.useGPU and isinstance(inference_method, VarDTC_GPU):
|
||||
kernel.psicomp.GPU_direct = True
|
||||
|
||||
super(BayesianGPLVM,self).__init__(X, Y, Z, kernel, likelihood=likelihood, name=name, inference_method=inference_method, normalizer=normalizer, mpi_comm=mpi_comm, variational_prior=self.variational_prior)
|
||||
super(BayesianGPLVM,self).__init__(X, Y, Z, kernel, likelihood=likelihood,
|
||||
name=name, inference_method=inference_method,
|
||||
normalizer=normalizer, mpi_comm=mpi_comm,
|
||||
variational_prior=self.variational_prior,
|
||||
missing_data=missing_data)
|
||||
|
||||
def set_X_gradients(self, X, X_grad):
|
||||
"""Set the gradients of the posterior distribution of X in its specific form."""
|
||||
|
|
@ -86,15 +87,48 @@ class BayesianGPLVM(SparseGP_MPI):
|
|||
"""Get the gradients of the posterior distribution of X in its specific form."""
|
||||
return X.mean.gradient, X.variance.gradient
|
||||
|
||||
def _inner_parameters_changed(self, kern, X, Z, likelihood, Y, Y_metadata, Lm=None, dL_dKmm=None, subset_indices=None):
|
||||
posterior, log_marginal_likelihood, grad_dict, current_values, value_indices = super(BayesianGPLVM, self)._inner_parameters_changed(kern, X, Z, likelihood, Y, Y_metadata, Lm=Lm, dL_dKmm=dL_dKmm, subset_indices=subset_indices)
|
||||
|
||||
log_marginal_likelihood -= self.variational_prior.KL_divergence(X)
|
||||
|
||||
current_values['meangrad'], current_values['vargrad'] = self.kern.gradients_qX_expectations(
|
||||
variational_posterior=X,
|
||||
Z=Z, dL_dpsi0=grad_dict['dL_dpsi0'],
|
||||
dL_dpsi1=grad_dict['dL_dpsi1'],
|
||||
dL_dpsi2=grad_dict['dL_dpsi2'])
|
||||
X.mean.gradient[:] = 0
|
||||
X.variance.gradient[:] = 0
|
||||
self.variational_prior.update_gradients_KL(X)
|
||||
current_values['meangrad'] += X.mean.gradient
|
||||
current_values['vargrad'] += X.variance.gradient
|
||||
|
||||
value_indices['meangrad'] = subset_indices['samples']
|
||||
value_indices['vargrad'] = subset_indices['samples']
|
||||
return posterior, log_marginal_likelihood, grad_dict, current_values, value_indices
|
||||
|
||||
def _outer_values_update(self, full_values):
|
||||
"""
|
||||
Here you put the values, which were collected before in the right places.
|
||||
E.g. set the gradients of parameters, etc.
|
||||
"""
|
||||
super(BayesianGPLVM, self)._outer_values_update(full_values)
|
||||
self.X.mean.gradient = full_values['meangrad']
|
||||
self.X.variance.gradient = full_values['vargrad']
|
||||
|
||||
def _outer_init_full_values(self):
|
||||
return dict(meangrad=np.zeros(self.X.mean.shape),
|
||||
vargrad=np.zeros(self.X.variance.shape))
|
||||
|
||||
def parameters_changed(self):
|
||||
super(BayesianGPLVM,self).parameters_changed()
|
||||
if isinstance(self.inference_method, VarDTC_minibatch):
|
||||
return
|
||||
|
||||
super(BayesianGPLVM, self).parameters_changed()
|
||||
self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X)
|
||||
#super(BayesianGPLVM, self).parameters_changed()
|
||||
#self._log_marginal_likelihood -= self.variational_prior.KL_divergence(self.X)
|
||||
|
||||
self.X.mean.gradient, self.X.variance.gradient = self.kern.gradients_qX_expectations(variational_posterior=self.X, Z=self.Z, dL_dpsi0=self.grad_dict['dL_dpsi0'], dL_dpsi1=self.grad_dict['dL_dpsi1'], dL_dpsi2=self.grad_dict['dL_dpsi2'])
|
||||
#self.X.mean.gradient, self.X.variance.gradient = self.kern.gradients_qX_expectations(variational_posterior=self.X, Z=self.Z, dL_dpsi0=self.grad_dict['dL_dpsi0'], dL_dpsi1=self.grad_dict['dL_dpsi1'], dL_dpsi2=self.grad_dict['dL_dpsi2'])
|
||||
|
||||
# This is testing code -------------------------
|
||||
# i = np.random.randint(self.X.shape[0])
|
||||
|
|
@ -113,7 +147,7 @@ class BayesianGPLVM(SparseGP_MPI):
|
|||
# -----------------------------------------------
|
||||
|
||||
# update for the KL divergence
|
||||
self.variational_prior.update_gradients_KL(self.X)
|
||||
#self.variational_prior.update_gradients_KL(self.X)
|
||||
|
||||
def plot_latent(self, labels=None, which_indices=None,
|
||||
resolution=50, ax=None, marker='o', s=40,
|
||||
|
|
@ -178,7 +212,7 @@ class BayesianGPLVM(SparseGP_MPI):
|
|||
"""
|
||||
dmu_dX = np.zeros_like(Xnew)
|
||||
for i in range(self.Z.shape[0]):
|
||||
dmu_dX += self.kern.gradients_X(self.Cpsi1Vf[i:i + 1, :], Xnew, self.Z[i:i + 1, :])
|
||||
dmu_dX += self.kern.gradients_X(self.grad_dict['dL_dpsi1'][i:i + 1, :], Xnew, self.Z[i:i + 1, :])
|
||||
return dmu_dX
|
||||
|
||||
def dmu_dXnew(self, Xnew):
|
||||
|
|
@ -189,7 +223,7 @@ class BayesianGPLVM(SparseGP_MPI):
|
|||
ones = np.ones((1, 1))
|
||||
for i in range(self.Z.shape[0]):
|
||||
gradients_X[:, i] = self.kern.gradients_X(ones, Xnew, self.Z[i:i + 1, :]).sum(-1)
|
||||
return np.dot(gradients_X, self.Cpsi1Vf)
|
||||
return np.dot(gradients_X, self.grad_dict['dL_dpsi1'])
|
||||
|
||||
def plot_steepest_gradient_map(self, *args, ** kwargs):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -20,7 +20,7 @@ class MiscTests(unittest.TestCase):
|
|||
m = GPy.models.GPRegression(self.X, self.Y, kernel=k)
|
||||
m.randomize()
|
||||
m.likelihood.variance = .5
|
||||
Kinv = np.linalg.pinv(k.K(self.X) + np.eye(self.N)*m.likelihood.variance)
|
||||
Kinv = np.linalg.pinv(k.K(self.X) + np.eye(self.N) * m.likelihood.variance)
|
||||
K_hat = k.K(self.X_new) - k.K(self.X_new, self.X).dot(Kinv).dot(k.K(self.X, self.X_new))
|
||||
mu_hat = k.K(self.X_new, self.X).dot(Kinv).dot(m.Y_normalized)
|
||||
|
||||
|
|
@ -43,7 +43,7 @@ class MiscTests(unittest.TestCase):
|
|||
Z = m.Z[:]
|
||||
X = self.X[:]
|
||||
|
||||
#Not easy to check if woodbury_inv is correct in itself as it requires a large derivation and expression
|
||||
# Not easy to check if woodbury_inv is correct in itself as it requires a large derivation and expression
|
||||
Kinv = m.posterior.woodbury_inv
|
||||
K_hat = k.K(self.X_new) - k.K(self.X_new, Z).dot(Kinv).dot(k.K(Z, self.X_new))
|
||||
|
||||
|
|
@ -51,13 +51,13 @@ class MiscTests(unittest.TestCase):
|
|||
self.assertEquals(mu.shape, (self.N_new, self.D))
|
||||
self.assertEquals(covar.shape, (self.N_new, self.N_new))
|
||||
np.testing.assert_almost_equal(K_hat, covar)
|
||||
#np.testing.assert_almost_equal(mu_hat, mu)
|
||||
# np.testing.assert_almost_equal(mu_hat, mu)
|
||||
|
||||
mu, var = m._raw_predict(self.X_new)
|
||||
self.assertEquals(mu.shape, (self.N_new, self.D))
|
||||
self.assertEquals(var.shape, (self.N_new, 1))
|
||||
np.testing.assert_almost_equal(np.diag(K_hat)[:, None], var)
|
||||
#np.testing.assert_almost_equal(mu_hat, mu)
|
||||
# np.testing.assert_almost_equal(mu_hat, mu)
|
||||
|
||||
def test_likelihood_replicate(self):
|
||||
m = GPy.models.GPRegression(self.X, self.Y)
|
||||
|
|
@ -110,6 +110,29 @@ class MiscTests(unittest.TestCase):
|
|||
m2.kern.lengthscale = m.kern['.*lengthscale']
|
||||
np.testing.assert_equal(m.log_likelihood(), m2.log_likelihood())
|
||||
|
||||
def test_missing_data(self):
|
||||
from GPy import kern
|
||||
from GPy.models import BayesianGPLVM
|
||||
from GPy.examples.dimensionality_reduction import _simulate_matern
|
||||
|
||||
D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 400, 3, 4
|
||||
_, _, Ylist = _simulate_matern(D1, D2, D3, N, num_inducing, False)
|
||||
Y = Ylist[0]
|
||||
|
||||
inan = np.random.binomial(1, .9, size=Y.shape).astype(bool) # 80% missing data
|
||||
Ymissing = Y.copy()
|
||||
Ymissing[inan] = np.nan
|
||||
|
||||
k = kern.Linear(Q, ARD=True) + kern.White(Q, np.exp(-2)) # + kern.bias(Q)
|
||||
m = BayesianGPLVM(Ymissing, Q, init="random", num_inducing=num_inducing,
|
||||
kernel=k, missing_data=True)
|
||||
assert(m.checkgrad())
|
||||
|
||||
k = kern.RBF(Q, ARD=True) + kern.White(Q, np.exp(-2)) # + kern.bias(Q)
|
||||
m = BayesianGPLVM(Ymissing, Q, init="random", num_inducing=num_inducing,
|
||||
kernel=k, missing_data=True)
|
||||
assert(m.checkgrad())
|
||||
|
||||
def test_likelihood_replicate_kern(self):
|
||||
m = GPy.models.GPRegression(self.X, self.Y)
|
||||
m2 = GPy.models.GPRegression(self.X, self.Y)
|
||||
|
|
@ -158,7 +181,7 @@ class MiscTests(unittest.TestCase):
|
|||
def test_model_optimize(self):
|
||||
X = np.random.uniform(-3., 3., (20, 1))
|
||||
Y = np.sin(X) + np.random.randn(20, 1) * 0.05
|
||||
m = GPy.models.GPRegression(X,Y)
|
||||
m = GPy.models.GPRegression(X, Y)
|
||||
m.optimize()
|
||||
print m
|
||||
|
||||
|
|
@ -219,8 +242,8 @@ class GradientTests(np.testing.TestCase):
|
|||
mlp = GPy.kern.MLP(1)
|
||||
self.check_model(mlp, model_type='GPRegression', dimension=1)
|
||||
|
||||
#TODO:
|
||||
#def test_GPRegression_poly_1d(self):
|
||||
# TODO:
|
||||
# def test_GPRegression_poly_1d(self):
|
||||
# ''' Testing the GP regression with polynomial kernel with white kernel on 1d data '''
|
||||
# mlp = GPy.kern.Poly(1, degree=5)
|
||||
# self.check_model(mlp, model_type='GPRegression', dimension=1)
|
||||
|
|
@ -417,11 +440,11 @@ class GradientTests(np.testing.TestCase):
|
|||
|
||||
def test_gp_heteroscedastic_regression(self):
|
||||
num_obs = 25
|
||||
X = np.random.randint(0,140,num_obs)
|
||||
X = X[:,None]
|
||||
Y = 25. + np.sin(X/20.) * 2. + np.random.rand(num_obs)[:,None]
|
||||
X = np.random.randint(0, 140, num_obs)
|
||||
X = X[:, None]
|
||||
Y = 25. + np.sin(X / 20.) * 2. + np.random.rand(num_obs)[:, None]
|
||||
kern = GPy.kern.Bias(1) + GPy.kern.RBF(1)
|
||||
m = GPy.models.GPHeteroscedasticRegression(X,Y,kern)
|
||||
m = GPy.models.GPHeteroscedasticRegression(X, Y, kern)
|
||||
self.assertTrue(m.checkgrad())
|
||||
|
||||
def test_gp_kronecker_gaussian(self):
|
||||
|
|
@ -432,12 +455,12 @@ class GradientTests(np.testing.TestCase):
|
|||
k1 = GPy.kern.RBF(1) # + GPy.kern.White(1)
|
||||
k2 = GPy.kern.RBF(1) # + GPy.kern.White(1)
|
||||
Y = np.random.randn(N1, N2)
|
||||
Y = Y-Y.mean(0)
|
||||
Y = Y/Y.std(0)
|
||||
Y = Y - Y.mean(0)
|
||||
Y = Y / Y.std(0)
|
||||
m = GPy.models.GPKroneckerGaussianRegression(X1, X2, Y, k1, k2)
|
||||
|
||||
# build the model the dumb way
|
||||
assert (N1*N2<1000), "too much data for standard GPs!"
|
||||
assert (N1 * N2 < 1000), "too much data for standard GPs!"
|
||||
yy, xx = np.meshgrid(X2, X1)
|
||||
Xgrid = np.vstack((xx.flatten(order='F'), yy.flatten(order='F'))).T
|
||||
kg = GPy.kern.RBF(1, active_dims=[0]) * GPy.kern.RBF(1, active_dims=[1])
|
||||
|
|
@ -458,11 +481,11 @@ class GradientTests(np.testing.TestCase):
|
|||
|
||||
def test_gp_VGPC(self):
|
||||
num_obs = 25
|
||||
X = np.random.randint(0,140,num_obs)
|
||||
X = X[:,None]
|
||||
Y = 25. + np.sin(X/20.) * 2. + np.random.rand(num_obs)[:,None]
|
||||
X = np.random.randint(0, 140, num_obs)
|
||||
X = X[:, None]
|
||||
Y = 25. + np.sin(X / 20.) * 2. + np.random.rand(num_obs)[:, None]
|
||||
kern = GPy.kern.Bias(1) + GPy.kern.RBF(1)
|
||||
m = GPy.models.GPVariationalGaussianApproximation(X,Y,kern)
|
||||
m = GPy.models.GPVariationalGaussianApproximation(X, Y, kern)
|
||||
self.assertTrue(m.checkgrad())
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue