mirror of
https://github.com/SheffieldML/GPy.git
synced 2026-06-11 15:15:15 +02:00
messy merge
This commit is contained in:
commit
89b8b0d298
16 changed files with 352 additions and 99 deletions
|
|
@ -124,6 +124,7 @@ class GP(Model):
|
|||
else:
|
||||
self.X = ObsAr(X)
|
||||
self.update_model(True)
|
||||
self._trigger_params_changed()
|
||||
|
||||
def set_X(self,X):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -213,7 +213,7 @@ class Model(Parameterized):
|
|||
self.obj_grads = np.clip(self._transform_gradients(self.objective_function_gradients()), -1e10, 1e10)
|
||||
return obj_f, self.obj_grads
|
||||
|
||||
def optimize(self, optimizer=None, start=None, messages=False, max_iters=1000, ipython_notebook=False, **kwargs):
|
||||
def optimize(self, optimizer=None, start=None, messages=False, max_iters=1000, ipython_notebook=True, **kwargs):
|
||||
"""
|
||||
Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
|
||||
|
||||
|
|
@ -255,7 +255,7 @@ class Model(Parameterized):
|
|||
else:
|
||||
optimizer = optimization.get_optimizer(optimizer)
|
||||
opt = optimizer(start, model=self, max_iters=max_iters, **kwargs)
|
||||
|
||||
|
||||
with VerboseOptimization(self, opt, maxiters=max_iters, verbose=messages, ipython_notebook=ipython_notebook) as vo:
|
||||
opt.run(f_fp=self._objective_grads, f=self._objective, fp=self._grads)
|
||||
vo.finish(opt)
|
||||
|
|
@ -402,7 +402,7 @@ class Model(Parameterized):
|
|||
model_details = [['<b>Model</b>', self.name + '<br>'],
|
||||
['<b>Log-likelihood</b>', '{}<br>'.format(float(self.log_likelihood()))],
|
||||
["<b>Number of Parameters</b>", '{}<br>'.format(self.size)],
|
||||
["<b>Updates</b>", '{}<br>'.format(self._updates)],
|
||||
["<b>Updates</b>", '{}<br>'.format(self._update_on)],
|
||||
]
|
||||
from operator import itemgetter
|
||||
to_print = ["""<style type="text/css">
|
||||
|
|
@ -419,7 +419,7 @@ class Model(Parameterized):
|
|||
model_details = [['Name', self.name],
|
||||
['Log-likelihood', '{}'.format(float(self.log_likelihood()))],
|
||||
["Number of Parameters", '{}'.format(self.size)],
|
||||
["Updates", '{}'.format(self._updates)],
|
||||
["Updates", '{}'.format(self._update_on)],
|
||||
]
|
||||
from operator import itemgetter
|
||||
max_len = reduce(lambda a, b: max(len(b[0]), a), model_details, 0)
|
||||
|
|
|
|||
|
|
@ -1042,6 +1042,9 @@ class Parameterizable(OptimizationHandlable):
|
|||
p = param_to_array(p)
|
||||
d = f.create_dataset(n,p.shape,dtype=p.dtype)
|
||||
d[:] = p
|
||||
if hasattr(self, 'param_array'):
|
||||
d = f.create_dataset('param_array',self.param_array.shape, dtype=self.param_array.dtype)
|
||||
d[:] = self.param_array
|
||||
f.close()
|
||||
except:
|
||||
raise 'Fails to write the parameters into a HDF5 file!'
|
||||
|
|
|
|||
|
|
@ -549,7 +549,8 @@ class DGPLVM(Prior):
|
|||
M_i = np.zeros((self.classnum, self.dim))
|
||||
for i in cls:
|
||||
# Mean of each class
|
||||
M_i[i] = np.mean(cls[i], axis=0)
|
||||
class_i = cls[i]
|
||||
M_i[i] = np.mean(class_i, axis=0)
|
||||
return M_i
|
||||
|
||||
# Adding data points as tuple to the dictionary so that we can access indices
|
||||
|
|
@ -661,7 +662,8 @@ class DGPLVM(Prior):
|
|||
Sw = self.compute_Sw(cls, M_i)
|
||||
# Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
|
||||
#Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
|
||||
Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
|
||||
#Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
|
||||
Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
|
||||
return (-1 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))
|
||||
|
||||
# This function calculates derivative of the log of prior function
|
||||
|
|
@ -680,8 +682,9 @@ class DGPLVM(Prior):
|
|||
|
||||
# Calculating inverse of Sb and its transpose and minus
|
||||
# Sb_inv_N = np.linalg.inv(Sb + np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))
|
||||
# Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
|
||||
Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
|
||||
#Sb_inv_N = np.linalg.inv(Sb+np.eye(Sb.shape[0])*0.1)
|
||||
#Sb_inv_N = pdinv(Sb+ np.eye(Sb.shape[0]) * (np.diag(Sb).min() * 0.1))[0]
|
||||
Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0])*0.1)[0]
|
||||
Sb_inv_N_trans = np.transpose(Sb_inv_N)
|
||||
Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
|
||||
Sw_trans = np.transpose(Sw)
|
||||
|
|
@ -706,7 +709,230 @@ class DGPLVM(Prior):
|
|||
return np.random.rand(n) # A WRONG implementation
|
||||
|
||||
def __str__(self):
|
||||
return 'DGPLVM_prior'
|
||||
return 'DGPLVM_prior_Raq'
|
||||
|
||||
|
||||
|
||||
class DGPLVM_T(Prior):
    """
    Implementation of the Discriminative Gaussian Process Latent Variable model paper, by Raquel.

    :param sigma2: constant dividing the prior (the log density is scaled by -1/sigma2)
    :param lbl: label matrix with one row per data point and one column per class;
                the class of a row is the index of its first entry equal to 1
    :param x_shape: shape the flat parameter vector is reshaped to (datanum, dim)
    :param vec: weights multiplied elementwise with the members of a class
                before the class mean is taken

    .. Note:: DGPLVM for Classification paper implementation

    """
    domain = _REAL

    def __init__(self, sigma2, lbl, x_shape, vec):
        self.sigma2 = sigma2
        self.lbl = lbl
        self.classnum = lbl.shape[1]
        self.datanum = lbl.shape[0]
        self.x_shape = x_shape
        self.dim = x_shape[1]
        self.vec = vec

    def get_class_label(self, y):
        """Return the index of the first entry of ``y`` equal to 1, or -1 if there is none."""
        for idx, v in enumerate(y):
            if v == 1:
                return idx
        # NOTE(review): callers use the label as an array index, so -1 would
        # address the last class row -- confirm every row of lbl contains a 1.
        return -1

    def compute_cls(self, x):
        """Group the rows of ``x`` by class label.

        :returns: dict mapping class label -> list of the rows of ``x`` in that class
        """
        cls = {}
        for j in range(self.datanum):
            class_label = self.get_class_label(self.lbl[j])
            cls.setdefault(class_label, []).append(x[j])
        return cls

    def compute_Mi(self, cls, vec):
        """Return the (classnum, dim) array of per-class means.

        The members of each class are multiplied elementwise by ``vec``
        before the mean over the class is taken along axis 0.
        """
        M_i = np.zeros((self.classnum, self.dim))
        for i in cls:
            class_i = np.multiply(cls[i], vec)
            M_i[i] = np.mean(class_i, axis=0)
        return M_i

    def compute_indices(self, x):
        """Group ``(row index, row)`` tuples by class label so indices stay accessible.

        :returns: dict mapping class label -> list of ``(j, x[j])`` tuples
        """
        data_idx = {}
        for j in range(self.datanum):
            class_label = self.get_class_label(self.lbl[j])
            data_idx.setdefault(class_label, []).append((j, x[j]))
        return data_idx

    def compute_listIndices(self, data_idx):
        """Return one list of row indices per class, in the iteration order of ``data_idx``."""
        return [[tpl[0] for tpl in data_idx[i]] for i in data_idx]

    def compute_Sb(self, cls, M_i, M_0):
        """Return the (dim, dim) between-class scatter matrix."""
        Sb = np.zeros((self.dim, self.dim))
        for i in cls:
            B = (M_i[i] - M_0).reshape(self.dim, 1)
            Sb += (float(len(cls[i])) / self.datanum) * B.dot(B.transpose())
        return Sb

    def compute_Sw(self, cls, M_i):
        """Return the (dim, dim) within-class scatter matrix."""
        Sw = np.zeros((self.dim, self.dim))
        for i in cls:
            N_i = float(len(cls[i]))
            W_WT = np.zeros((self.dim, self.dim))
            for xk in cls[i]:
                W = (xk - M_i[i])
                W_WT += np.outer(W, W)
            Sw += (N_i / self.datanum) * ((1. / N_i) * W_WT)
        return Sw

    def compute_sig_beta_Bi(self, data_idx, M_i, M_0, lst_idx_all):
        """Return the (dim, datanum) sum over classes of beta * B_i for every data point.

        ``lst_idx_all`` is accepted for backward compatibility only. Class
        membership is read from ``data_idx`` itself, because ``lst_idx_all``
        is ordered by dict iteration and is not guaranteed to line up with
        the class labels used as keys.
        """
        B_i = np.zeros((self.classnum, self.dim))
        members = {}
        for i in data_idx:
            B_i[i] = M_i[i] - M_0
            members[i] = set(tpl[0] for tpl in data_idx[i])

        Sig_beta_B_i_all = np.zeros((self.datanum, self.dim))
        for k in range(self.datanum):
            for i in data_idx:
                N_i = float(len(data_idx[i]))
                if k in members[i]:
                    beta = (1.0 / N_i) - (1.0 / self.datanum)
                else:
                    beta = -(1.0 / self.datanum)
                Sig_beta_B_i_all[k] += N_i / self.datanum * (beta * B_i[i])
        return Sig_beta_B_i_all.transpose()

    def compute_wj(self, data_idx, M_i):
        """Return the (datanum, dim) array of each point minus the mean of its class."""
        W_i = np.zeros((self.datanum, self.dim))
        for i in data_idx:
            for j, xj in data_idx[i]:
                W_i[j] = (xj - M_i[i])
        return W_i

    def compute_sig_alpha_W(self, data_idx, lst_idx_all, W_i):
        """Return the (dim, datanum) sum of alpha * W_j over each point's class, scaled by 1/datanum.

        ``lst_idx_all`` is accepted for backward compatibility only; the
        member indices are taken from ``data_idx`` (same order), which
        avoids indexing a dict-ordered list with a class label.
        """
        Sig_alpha_W_i = np.zeros((self.datanum, self.dim))
        for i in data_idx:
            N_i = float(len(data_idx[i]))
            for k, _ in data_idx[i]:
                for j, _ in data_idx[i]:
                    if k == j:
                        alpha = 1 - (1.0 / N_i)
                    else:
                        alpha = 0 - (1.0 / N_i)
                    Sig_alpha_W_i[k] += (alpha * W_i[j])
        return (1. / self.datanum) * np.transpose(Sig_alpha_W_i)

    def lnpdf(self, x):
        """Return the log of the prior: ``-1/sigma2 * trace(inv(Sb + 0.1*I) . Sw)``."""
        x = x.reshape(self.x_shape)
        cls = self.compute_cls(x)
        M_0 = np.mean(x, axis=0)
        M_i = self.compute_Mi(cls, self.vec)
        Sb = self.compute_Sb(cls, M_i, M_0)
        Sw = self.compute_Sw(cls, M_i)
        # A fixed 0.1 is added to the diagonal of Sb before inverting.
        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.1)[0]
        # -1.0 forces true division: with an integer sigma2, Python 2 would
        # floor-divide "-1 / sigma2".
        return (-1.0 / self.sigma2) * np.trace(Sb_inv_N.dot(Sw))

    def lnpdf_grad(self, x):
        """Return the derivative of the log prior with respect to ``x``, shaped (datanum, dim)."""
        x = x.reshape(self.x_shape)
        cls = self.compute_cls(x)
        M_0 = np.mean(x, axis=0)
        M_i = self.compute_Mi(cls, self.vec)
        Sb = self.compute_Sb(cls, M_i, M_0)
        Sw = self.compute_Sw(cls, M_i)
        data_idx = self.compute_indices(x)
        lst_idx_all = self.compute_listIndices(data_idx)
        Sig_beta_B_i_all = self.compute_sig_beta_Bi(data_idx, M_i, M_0, lst_idx_all)
        W_i = self.compute_wj(data_idx, M_i)
        Sig_alpha_W_i = self.compute_sig_alpha_W(data_idx, lst_idx_all, W_i)

        # Inverse of the regularised Sb, its transpose and the negated transpose
        Sb_inv_N = pdinv(Sb + np.eye(Sb.shape[0]) * 0.1)[0]
        Sb_inv_N_trans = np.transpose(Sb_inv_N)
        Sb_inv_N_trans_minus = -1 * Sb_inv_N_trans
        Sw_trans = np.transpose(Sw)

        # Calculating DJ/DXk
        DJ_Dxk = 2 * (
            Sb_inv_N_trans_minus.dot(Sw_trans).dot(Sb_inv_N_trans).dot(Sig_beta_B_i_all)
            + Sb_inv_N_trans.dot(Sig_alpha_W_i))
        # Derivative of the log of the prior (true division, see lnpdf)
        DPx_Dx = ((-1.0 / self.sigma2) * DJ_Dxk)
        return DPx_Dx.T

    def rvs(self, n):
        """Return ``n`` uniform random numbers; flagged by the author as a wrong placeholder."""
        return np.random.rand(n)  # A WRONG implementation

    def __str__(self):
        return 'DGPLVM_prior_Raq_TTT'
|
||||
|
||||
|
||||
|
||||
class HalfT(Prior):
|
||||
"""
|
||||
|
|
|
|||
|
|
@ -11,7 +11,6 @@ class Updateable(Observable):
|
|||
A model can be updated or not.
|
||||
Make sure updates can be switched on and off.
|
||||
"""
|
||||
_updates = True
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(Updateable, self).__init__(*args, **kwargs)
|
||||
|
||||
|
|
|
|||
|
|
@ -6,7 +6,8 @@ from gp import GP
|
|||
from parameterization.param import Param
|
||||
from ..inference.latent_function_inference import var_dtc
|
||||
from .. import likelihoods
|
||||
from parameterization.variational import VariationalPosterior
|
||||
from parameterization.variational import VariationalPosterior, NormalPosterior
|
||||
from ..util.linalg import mdot
|
||||
|
||||
import logging
|
||||
from GPy.inference.latent_function_inference.posterior import Posterior
|
||||
|
|
@ -102,7 +103,15 @@ class SparseGP(GP):
|
|||
|
||||
def _raw_predict(self, Xnew, full_cov=False, kern=None):
|
||||
"""
|
||||
Make a prediction for the latent function values
|
||||
Make a prediction for the latent function values.
|
||||
|
||||
For certain inputs we give back a full_cov of shape NxN,
|
||||
if there is missing data, each dimension has its own full_cov of shape NxNxD, and if full_cov is off,
|
||||
we take only the diagonal elements across N.
|
||||
|
||||
For uncertain inputs, the SparseGP bound produces a full covariance structure across D, so for full_cov we
|
||||
return a NxDxD matrix and in the not full_cov case, we return the diagonal elements across D (NxD).
|
||||
This is for both with and without missing data.
|
||||
"""
|
||||
|
||||
if kern is None: kern = self.kern
|
||||
|
|
@ -121,15 +130,32 @@ class SparseGP(GP):
|
|||
Kxx = kern.Kdiag(Xnew)
|
||||
var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
|
||||
else:
|
||||
Kx = kern.psi1(self.Z, Xnew).T
|
||||
mu = np.dot(Kx.T, self.posterior.woodbury_vector)
|
||||
if full_cov:
|
||||
Kxx = kern.K(Xnew.mean)
|
||||
if self.posterior.woodbury_inv.ndim == 2:
|
||||
var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx))
|
||||
elif self.posterior.woodbury_inv.ndim == 3:
|
||||
var = Kxx[:,:,None] - np.tensordot(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx).T, Kx, [1,0]).swapaxes(1,2)
|
||||
else:
|
||||
Kxx = kern.psi0(self.Z, Xnew)
|
||||
var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
|
||||
psi0_star = self.kern.psi0(self.Z, Xnew)
|
||||
psi1_star = self.kern.psi1(self.Z, Xnew)
|
||||
#psi2_star = self.kern.psi2(self.Z, Xnew) # Only possible if we get NxMxM psi2 out of the code.
|
||||
la = self.posterior.woodbury_vector
|
||||
mu = np.dot(psi1_star, la) # TODO: dimensions?
|
||||
|
||||
if full_cov:
|
||||
var = np.empty((Xnew.shape[0], la.shape[1], la.shape[1]))
|
||||
di = np.diag_indices(la.shape[1])
|
||||
else:
|
||||
var = np.empty((Xnew.shape[0], la.shape[1]))
|
||||
|
||||
for i in range(Xnew.shape[0]):
|
||||
_mu, _var = Xnew.mean.values[[i]], Xnew.variance.values[[i]]
|
||||
psi2_star = self.kern.psi2(self.Z, NormalPosterior(_mu, _var))
|
||||
tmp = (psi2_star[:, :] - psi1_star[[i]].T.dot(psi1_star[[i]]))
|
||||
|
||||
var_ = mdot(la.T, tmp, la)
|
||||
p0 = psi0_star[i]
|
||||
t = np.atleast_3d(self.posterior.woodbury_inv)
|
||||
t2 = np.trace(t.T.dot(psi2_star), axis1=1, axis2=2)
|
||||
|
||||
if full_cov:
|
||||
var_[di] += p0
|
||||
var_[di] += -t2
|
||||
var[i] = var_
|
||||
else:
|
||||
var[i] = np.diag(var_)+p0-t2
|
||||
return mu, var
|
||||
|
|
|
|||
|
|
@ -26,23 +26,17 @@ class SVGP(SparseGP):
|
|||
Hensman, Matthews and Ghahramani, Scalable Variational GP Classification, ArXiv 1411.2005
|
||||
"""
|
||||
self.batchsize = batchsize
|
||||
self.X_all, self.Y_all = X, Y
|
||||
if batchsize is None:
|
||||
X_batch, Y_batch = X, Y
|
||||
KL_scale, batch_scale = 1., 1.
|
||||
|
||||
else:
|
||||
self.X_all, self.Y_all = X, Y
|
||||
# how to rescale the batch likelihood in case of minibatches
|
||||
batch_scale = float(self.X_all.shape[0])/float(self.batchsize)
|
||||
KL_scale = 1.0
|
||||
|
||||
import climin.util
|
||||
#Make a climin slicer to make drawing minibatches much quicker
|
||||
self.slicer = climin.util.draw_mini_slices(self.X_all.shape[0], self.batchsize)
|
||||
X_batch, Y_batch = self.new_batch()
|
||||
|
||||
#create the SVI inference method
|
||||
inf_method = svgp_inf(KL_scale=KL_scale, batch_scale=batch_scale)
|
||||
inf_method = svgp_inf()
|
||||
|
||||
SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, inference_method=inf_method,
|
||||
name=name, Y_metadata=Y_metadata, normalizer=False)
|
||||
|
|
@ -54,7 +48,7 @@ class SVGP(SparseGP):
|
|||
self.link_parameter(self.m)
|
||||
|
||||
def parameters_changed(self):
|
||||
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata)
|
||||
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0]))
|
||||
|
||||
#update the kernel gradients
|
||||
self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z)
|
||||
|
|
|
|||
|
|
@ -11,9 +11,8 @@ def exponents(fnow, current_grad):
|
|||
return np.sign(exps) * np.log10(exps).astype(int)
|
||||
|
||||
class VerboseOptimization(object):
|
||||
def __init__(self, model, opt, maxiters, verbose=True, current_iteration=0, ipython_notebook=False):
|
||||
def __init__(self, model, opt, maxiters, verbose=False, current_iteration=0, ipython_notebook=True):
|
||||
self.verbose = verbose
|
||||
self.ipython_notebook = ipython_notebook
|
||||
if self.verbose:
|
||||
self.model = model
|
||||
self.iteration = current_iteration
|
||||
|
|
@ -26,13 +25,18 @@ class VerboseOptimization(object):
|
|||
|
||||
self.update()
|
||||
|
||||
if self.ipython_notebook:
|
||||
try:
|
||||
from IPython.display import display
|
||||
from IPython.html.widgets import FloatProgressWidget, HTMLWidget, ContainerWidget
|
||||
self.text = HTMLWidget()
|
||||
self.progress = FloatProgressWidget()
|
||||
self.model_show = HTMLWidget()
|
||||
self.ipython_notebook = ipython_notebook
|
||||
except:
|
||||
# Not in Ipython notebook
|
||||
self.ipython_notebook = False
|
||||
|
||||
if self.ipython_notebook:
|
||||
self.text.set_css('width', '100%')
|
||||
#self.progress.set_css('width', '100%')
|
||||
|
||||
|
|
@ -140,6 +144,7 @@ class VerboseOptimization(object):
|
|||
self.print_out()
|
||||
|
||||
if not self.ipython_notebook:
|
||||
print
|
||||
print ''
|
||||
print 'Optimization finished in {0:.5g} Seconds'.format(self.stop-self.start)
|
||||
print 'Optimization status: {0:.5g}'.format(self.status)
|
||||
print
|
||||
|
|
|
|||
|
|
@ -5,11 +5,8 @@ import numpy as np
|
|||
from posterior import Posterior
|
||||
|
||||
class SVGP(LatentFunctionInference):
|
||||
def __init__(self, KL_scale=1., batch_scale=1.):
|
||||
self.KL_scale = KL_scale
|
||||
self.batch_scale = batch_scale
|
||||
|
||||
def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, Y_metadata=None):
|
||||
def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, Y_metadata=None, KL_scale=1.0, batch_scale=1.0):
|
||||
num_inducing = Z.shape[0]
|
||||
num_data, num_outputs = Y.shape
|
||||
|
||||
|
|
@ -44,9 +41,6 @@ class SVGP(LatentFunctionInference):
|
|||
dKL_dS = 0.5*(Kmmi[:,:,None] - Si)
|
||||
dKL_dKmm = 0.5*num_outputs*Kmmi - 0.5*Kmmi.dot(S.sum(-1)).dot(Kmmi) - 0.5*Kmmim.dot(Kmmim.T)
|
||||
|
||||
KL_scale = self.KL_scale
|
||||
batch_scale = self.batch_scale
|
||||
KL, dKL_dKmm, dKL_dS, dKL_dm = KL_scale*KL, KL_scale*dKL_dKmm, KL_scale*dKL_dS, KL_scale*dKL_dm
|
||||
|
||||
#quadrature for the likelihood
|
||||
F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v)
|
||||
|
|
|
|||
|
|
@ -21,7 +21,7 @@ class VarDTC(LatentFunctionInference):
|
|||
For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it.
|
||||
|
||||
"""
|
||||
const_jitter = 1e-6
|
||||
const_jitter = 1e-8
|
||||
def __init__(self, limit=1):
|
||||
#self._YYTfactor_cache = caching.cache()
|
||||
from ...util.caching import Cacher
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ class VarDTC_minibatch(LatentFunctionInference):
|
|||
For efficiency, we sometimes work with the cholesky of Y*Y.T. To save repeatedly recomputing this, we cache it.
|
||||
|
||||
"""
|
||||
const_jitter = 1e-6
|
||||
const_jitter = 1e-8
|
||||
def __init__(self, batchsize=None, limit=1, mpi_comm=None):
|
||||
|
||||
self.batchsize = batchsize
|
||||
|
|
|
|||
|
|
@ -154,9 +154,9 @@ class Coregionalize(Kern):
|
|||
def _gradient_reduce_numpy(self, dL_dK, index, index2):
|
||||
index, index2 = index[:,0], index2[:,0]
|
||||
dL_dK_small = np.zeros_like(self.B)
|
||||
for i in range(k.output_dim):
|
||||
for i in range(self.output_dim):
|
||||
tmp1 = dL_dK[index==i]
|
||||
for j in range(k.output_dim):
|
||||
for j in range(self.output_dim):
|
||||
dL_dK_small[j,i] = tmp1[:,index2==j].sum()
|
||||
return dL_dK_small
|
||||
|
||||
|
|
|
|||
|
|
@ -42,25 +42,41 @@ class Prod(CombinationKernel):
|
|||
return reduce(np.multiply, (p.Kdiag(X) for p in which_parts))
|
||||
|
||||
def update_gradients_full(self, dL_dK, X, X2=None):
    """Propagate ``dL_dK`` to every part of the product kernel.

    Each part receives ``dL_dK`` multiplied by the covariance matrices of all
    the *other* parts. The product of the others is formed directly instead
    of dividing the full product by the part's own ``K``, so zero entries in
    a part's covariance no longer produce NaN/inf gradients. For two parts
    this is exactly the pairwise special case.

    :param dL_dK: gradient of the objective with respect to the product covariance
    :param X: inputs passed through to each part
    :param X2: optional second set of inputs passed through to each part
    """
    from functools import reduce  # builtin on Python 2, functools-only on Python 3

    part_Ks = [p.K(X, X2) for p in self.parts]
    for idx, part in enumerate(self.parts):
        others = part_Ks[:idx] + part_Ks[idx + 1:]
        part.update_gradients_full(reduce(np.multiply, others, dL_dK), X, X2)
|
||||
|
||||
def update_gradients_diag(self, dL_dKdiag, X):
    """Propagate ``dL_dKdiag`` to every part of the product kernel.

    Each part receives ``dL_dKdiag`` multiplied by the covariance diagonals of
    all the *other* parts. The product is formed directly rather than by
    dividing the full diagonal by the part's own ``Kdiag``, which would give
    NaN/inf wherever that diagonal is zero.

    :param dL_dKdiag: gradient of the objective with respect to the product's diagonal
    :param X: inputs passed through to each part
    """
    from functools import reduce  # builtin on Python 2, functools-only on Python 3

    part_diags = [p.Kdiag(X) for p in self.parts]
    for idx, part in enumerate(self.parts):
        others = part_diags[:idx] + part_diags[idx + 1:]
        part.update_gradients_diag(reduce(np.multiply, others, dL_dKdiag), X)
|
||||
|
||||
def gradients_X(self, dL_dK, X, X2=None):
    """Return the gradient of the objective with respect to ``X`` for the product kernel.

    The contributions of all parts are summed; each part is called with
    ``dL_dK`` multiplied by the covariance matrices of the *other* parts.
    Forming that product directly (instead of dividing the full product by
    the part's own ``K``) keeps the result finite where a part's covariance
    is zero.

    :param dL_dK: gradient of the objective with respect to the product covariance
    :param X: inputs; the result has ``X.shape``
    :param X2: optional second set of inputs passed through to each part
    """
    from functools import reduce  # builtin on Python 2, functools-only on Python 3

    target = np.zeros(X.shape)
    part_Ks = [p.K(X, X2) for p in self.parts]
    for idx, part in enumerate(self.parts):
        others = part_Ks[:idx] + part_Ks[idx + 1:]
        target += part.gradients_X(reduce(np.multiply, others, dL_dK), X, X2)
    return target
|
||||
|
||||
def gradients_X_diag(self, dL_dKdiag, X):
    """Return the gradient with respect to ``X`` through the diagonal of the product kernel.

    The contributions of all parts are summed; each part is called with
    ``dL_dKdiag`` multiplied by the covariance diagonals of the *other*
    parts. The product is formed directly rather than by dividing the full
    diagonal by the part's own ``Kdiag``, which would give NaN/inf wherever
    that diagonal is zero.

    :param dL_dKdiag: gradient of the objective with respect to the product's diagonal
    :param X: inputs; the result has ``X.shape``
    """
    from functools import reduce  # builtin on Python 2, functools-only on Python 3

    target = np.zeros(X.shape)
    part_diags = [p.Kdiag(X) for p in self.parts]
    for idx, part in enumerate(self.parts):
        others = part_diags[:idx] + part_diags[idx + 1:]
        target += part.gradients_X_diag(reduce(np.multiply, others, dL_dKdiag), X)
    return target
|
||||
|
|
|
|||
|
|
@ -77,6 +77,32 @@ class Bernoulli(Likelihood):
|
|||
|
||||
return Z_hat, mu_hat, sigma2_hat
|
||||
|
||||
def variational_expectations(self, Y, m, v, gh_points=None):
    """Gauss-Hermite estimate of the expected log-likelihood under a Gaussian, and its derivatives.

    Only implemented for the probit link; any other link raises
    ``NotImplementedError``.

    :param Y: observations; entries equal to 1 are treated as the positive
              class, everything else as the negative class
    :param m: means of the Gaussian over the latent function (same shape as ``Y``)
    :param v: variances of the Gaussian over the latent function (same shape as ``Y``)
    :param gh_points: optional ``(nodes, weights)`` pair as returned by
                      ``np.polynomial.hermite.hermgauss``; defaults to 20 points.
                      The weights are expected un-normalised (summing to sqrt(pi)).
    :returns: ``(F, dF_dm, dF_dv, None)`` where the first three have the shape
              of ``m``; the last slot (likelihood-parameter gradients) is None.
    """
    if not isinstance(self.gp_link, link_functions.Probit):
        raise NotImplementedError

    from scipy import stats

    if gh_points is None:
        gh_x, gh_w = np.polynomial.hermite.hermgauss(20)
    else:
        gh_x, gh_w = gh_points
    # hermgauss weights integrate against exp(-x^2) and sum to sqrt(pi);
    # an expectation under a normalised Gaussian needs them divided by
    # sqrt(pi), otherwise F and both derivatives are too large by that factor.
    gh_w = np.asarray(gh_w) / np.sqrt(np.pi)

    shape = m.shape
    m, v, Y = m.flatten(), v.flatten(), Y.flatten()
    Ysign = np.where(Y == 1, 1, -1)
    # Quadrature nodes mapped into latent space (change of variables
    # f = m + sqrt(2 v) x), with the label sign folded into the mean.
    X = gh_x[None, :] * np.sqrt(2. * v[:, None]) + (m * Ysign)[:, None]
    p = stats.norm.cdf(X)
    p = np.clip(p, 1e-9, 1. - 1e-9)  # for numerical stability
    N = stats.norm.pdf(X)
    F = np.log(p).dot(gh_w)
    NoverP = N / p
    dF_dm = (NoverP * Ysign[:, None]).dot(gh_w)
    dF_dv = -0.5 * (NoverP ** 2 + NoverP * X).dot(gh_w)
    return F.reshape(*shape), dF_dm.reshape(*shape), dF_dv.reshape(*shape), None
|
||||
|
||||
|
||||
def predictive_mean(self, mu, variance, Y_metadata=None):
|
||||
|
||||
if isinstance(self.gp_link, link_functions.Probit):
|
||||
|
|
|
|||
|
|
@ -3,6 +3,7 @@
|
|||
|
||||
import numpy as np
|
||||
from ..core.parameterization.param import Param
|
||||
from ..core.sparse_gp import SparseGP
|
||||
from ..core.gp import GP
|
||||
from ..inference.latent_function_inference import var_dtc
|
||||
from .. import likelihoods
|
||||
|
|
@ -16,14 +17,9 @@ from GPy.inference.optimization.stochastics import SparseGPStochastics,\
|
|||
#SparseGPMissing
|
||||
logger = logging.getLogger("sparse gp")
|
||||
|
||||
class SparseGPMiniBatch(GP):
|
||||
class SparseGPMiniBatch(SparseGP):
|
||||
"""
|
||||
A general purpose Sparse GP model
|
||||
'''
|
||||
Created on 3 Nov 2014
|
||||
|
||||
@author: maxz
|
||||
'''
|
||||
A general purpose Sparse GP model, allowing missing data and stochastics across dimensions.
|
||||
|
||||
This model allows (approximate) inference using variational DTC or FITC
|
||||
(Gaussian likelihoods) as well as non-conjugate sparse methods based on
|
||||
|
|
@ -315,34 +311,3 @@ Created on 3 Nov 2014
|
|||
else:
|
||||
self.posterior, self._log_marginal_likelihood, self.grad_dict, self.full_values, _ = self._inner_parameters_changed(self.kern, self.X, self.Z, self.likelihood, self.Y_normalized, self.Y_metadata)
|
||||
self._outer_values_update(self.full_values)
|
||||
|
||||
def _raw_predict(self, Xnew, full_cov=False, kern=None):
    """
    Make a prediction for the latent function values

    :param Xnew: new inputs; either plain inputs or a ``VariationalPosterior``
                 (uncertain inputs)
    :param full_cov: return the full covariance instead of only the variances;
                     not implemented for uncertain inputs
    :param kern: kernel to predict with; defaults to ``self.kern``
    :returns: ``(mu, var)``
    """
    if kern is None:
        kern = self.kern

    woodbury_inv = self.posterior.woodbury_inv
    if not isinstance(Xnew, VariationalPosterior):
        Kx = kern.K(self.Z, Xnew)
        mu = np.dot(Kx.T, self.posterior.woodbury_vector)
        if full_cov:
            Kxx = kern.K(Xnew)
            if woodbury_inv.ndim == 2:
                var = Kxx - np.dot(Kx.T, np.dot(woodbury_inv, Kx))
            elif woodbury_inv.ndim == 3:
                var = Kxx[:, :, None] - np.tensordot(
                    np.dot(np.atleast_3d(woodbury_inv).T, Kx).T, Kx, [1, 0]).swapaxes(1, 2)
        else:
            Kxx = kern.Kdiag(Xnew)
            var = (Kxx - np.sum(np.dot(np.atleast_3d(woodbury_inv).T, Kx) * Kx[None, :, :], 1)).T
    else:
        Kx = kern.psi1(self.Z, Xnew)
        mu = np.dot(Kx, self.posterior.woodbury_vector)
        if full_cov:
            # Call form of raise: valid on both Python 2 and Python 3.
            raise NotImplementedError("TODO")
        Kxx = kern.psi0(self.Z, Xnew)
        psi2 = kern.psi2(self.Z, Xnew)
        # NOTE(review): the original referenced an undefined name
        # ``Kmmi_LmiBLmi`` here (NameError at runtime). The posterior's
        # woodbury_inv is substituted because it plays that role in the
        # branch above -- confirm. Assumes psi2 is N x M x M and
        # woodbury_inv is 2-D -- TODO confirm against the kernel in use.
        var = Kxx - np.sum(np.sum(psi2 * woodbury_inv[None, :, :], 1), 1)
    return mu, var
|
||||
|
|
|
|||
|
|
@ -138,8 +138,6 @@ class Test(ListDictTestCase):
|
|||
self.assertIsNot(par.gradient_full, pcopy.gradient_full)
|
||||
self.assertTrue(pcopy.checkgrad())
|
||||
self.assert_(np.any(pcopy.gradient!=0.0))
|
||||
pcopy.optimize('bfgs')
|
||||
par.optimize('bfgs')
|
||||
np.testing.assert_allclose(pcopy.param_array, par.param_array, atol=1e-6)
|
||||
par.randomize()
|
||||
with tempfile.TemporaryFile('w+b') as f:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue