[mrd] more control for init, some missing data adjustments, init greatly improved

This commit is contained in:
Max Zwiessele 2014-05-20 14:49:20 +01:00
parent 995de897b3
commit ef256223d1

View file

@ -9,18 +9,25 @@ from ..core import Model
from ..kern import Kern from ..kern import Kern
from ..core.parameterization.variational import NormalPosterior, NormalPrior from ..core.parameterization.variational import NormalPosterior, NormalPrior
from ..core.parameterization import Param, Parameterized from ..core.parameterization import Param, Parameterized
from ..core.parameterization.observable_array import ObsAr
from ..inference.latent_function_inference.var_dtc import VarDTCMissingData, VarDTC from ..inference.latent_function_inference.var_dtc import VarDTCMissingData, VarDTC
from ..inference.latent_function_inference import InferenceMethodList
from ..likelihoods import Gaussian from ..likelihoods import Gaussian
from ..util.initialization import initialize_latent from ..util.initialization import initialize_latent
from ..core.sparse_gp import SparseGP, GP from ..core.sparse_gp import SparseGP, GP
from ..inference.latent_function_inference import InferenceMethodList
class MRD(SparseGP): class MRD(SparseGP):
""" """
!WARNING: This is bleeding edge code and still in development.
Functionality may change fundamentally during development!
Apply MRD to all given datasets Y in Ylist. Apply MRD to all given datasets Y in Ylist.
Y_i in [n x p_i] Y_i in [n x p_i]
If Ylist is a dictionary, the keys of the dictionary are the names, and the
values are the different datasets to compare.
The samples n in the datasets need The samples n in the datasets need
to match up, whereas the dimensionality p_d can differ. to match up, whereas the dimensionality p_d can differ.
@ -57,16 +64,46 @@ class MRD(SparseGP):
self.input_dim = input_dim self.input_dim = input_dim
self.num_inducing = num_inducing self.num_inducing = num_inducing
self.Ylist = Ylist if isinstance(Ylist, dict):
Ynames, Ylist = zip(*Ylist.items())
self.Ylist = [ObsAr(Y) for Y in Ylist]
if Ynames is None:
Ynames = ['Y{}'.format(i) for i in range(len(Ylist))]
self.names = Ynames
assert len(self.names) == len(self.Ylist), "one name per dataset, or None if Ylist is a dict"
if inference_method is None:
self.inference_method= InferenceMethodList()
warned = False
for y in Ylist:
inan = np.isnan(y)
if np.any(inan):
if not warned:
print "WARING: NaN values detected, make sure initx method can cope with NaN values or provide starting latent space X"
warned = True
self.inference_method.append(VarDTCMissingData(limit=1, inan=inan))
else:
self.inference_method.append(VarDTC(limit=1))
else:
if not isinstance(inference_method, InferenceMethodList):
inference_method = InferenceMethodList(inference_method)
self.inference_method = inference_method
self._in_init_ = True self._in_init_ = True
X, fracs = self._init_X(initx, Ylist) if X is None:
X, fracs = self._init_X(initx, Ylist)
else:
fracs = [X.var(0)]*len(Ylist)
self.Z = Param('inducing inputs', self._init_Z(initz, X)) self.Z = Param('inducing inputs', self._init_Z(initz, X))
self.num_inducing = self.Z.shape[0] # ensure M==N if M>N self.num_inducing = self.Z.shape[0] # ensure M==N if M>N
# sort out the kernels # sort out the kernels
if kernel is None: if kernel is None:
from ..kern import RBF from ..kern import RBF
self.kernels = [RBF(input_dim, ARD=1, lengthscale=fracs[i], name='rbf'.format(i)) for i in range(len(Ylist))] self.kernels = [RBF(input_dim, ARD=1, lengthscale=fracs[i]) for i in range(len(Ylist))]
elif isinstance(kernel, Kern): elif isinstance(kernel, Kern):
self.kernels = [] self.kernels = []
for i in range(len(Ylist)): for i in range(len(Ylist)):
@ -87,25 +124,8 @@ class MRD(SparseGP):
self.likelihoods = [Gaussian(name='Gaussian_noise'.format(i)) for i in range(len(Ylist))] self.likelihoods = [Gaussian(name='Gaussian_noise'.format(i)) for i in range(len(Ylist))]
else: self.likelihoods = likelihoods else: self.likelihoods = likelihoods
if inference_method is None:
self.inference_method= InferenceMethodList()
for y in Ylist:
inan = np.isnan(y)
if np.any(inan):
self.inference_method.append(VarDTCMissingData(limit=1, inan=inan))
else:
self.inference_method.append(VarDTC(limit=1))
else:
if not isinstance(inference_method, InferenceMethodList):
inference_method = InferenceMethodList(inference_method)
self.inference_method = inference_method
self.add_parameters(self.X, self.Z) self.add_parameters(self.X, self.Z)
if Ynames is None:
Ynames = ['Y{}'.format(i) for i in range(len(Ylist))]
self.names = Ynames
self.bgplvms = [] self.bgplvms = []
self.num_data = Ylist[0].shape[0] self.num_data = Ylist[0].shape[0]
@ -173,7 +193,7 @@ class MRD(SparseGP):
Ylist = self.Ylist Ylist = self.Ylist
if init in "PCA_concat": if init in "PCA_concat":
X, fracs = initialize_latent('PCA', self.input_dim, np.hstack(Ylist)) X, fracs = initialize_latent('PCA', self.input_dim, np.hstack(Ylist))
fracs = [fracs]*self.input_dim fracs = [fracs]*len(Ylist)
elif init in "PCA_single": elif init in "PCA_single":
X = np.zeros((Ylist[0].shape[0], self.input_dim)) X = np.zeros((Ylist[0].shape[0], self.input_dim))
fracs = [] fracs = []
@ -184,7 +204,7 @@ class MRD(SparseGP):
else: # init == 'random': else: # init == 'random':
X = np.random.randn(Ylist[0].shape[0], self.input_dim) X = np.random.randn(Ylist[0].shape[0], self.input_dim)
fracs = X.var(0) fracs = X.var(0)
fracs = [fracs]*self.input_dim fracs = [fracs]*len(Ylist)
X -= X.mean() X -= X.mean()
X /= X.std() X /= X.std()
return X, fracs return X, fracs