Mirror of https://github.com/SheffieldML/GPy.git (synced 2026-05-08 11:32:39 +02:00)

Commit 4139397dd3: "bugfix merge!"
37 changed files with 2415 additions and 1975 deletions
@@ -22,15 +22,44 @@ def extract_properties_to_index(index, props):

 class ParameterIndexOperations(object):
-    '''
-    Index operations for storing param index _properties
-    This class enables index with slices retrieved from object.__getitem__ calls.
-    Adding an index will add the selected indexes by the slice of an indexarray
-    indexing a shape shaped array to the flattened index array. Remove will
-    remove the selected slice indices from the flattened array.
-    You can give an offset to set an offset for the given indices in the
-    index array, for multi-param handling.
-    '''
+    """
+    This object wraps a dictionary, whose keys are _operations_ that we'd like
+    to apply to a parameter array, and whose values are np integer arrays which
+    index the parameter array appropriately.
+
+    A model instance will contain one instance of this class for each thing
+    that needs indexing (i.e. constraints, ties and priors). Parameters within
+    the model contain instances of the ParameterIndexOperationsView class,
+    which can map from a 'local' index (starting at 0) to this global index.
+
+    Here's an illustration:
+
+    #=======================================================================
+    model  : 0 1 2 3 4 5 6 7 8 9
+      key1 : 4 5
+      key2 : 7 8
+
+    param1 : 0 1 2 3 4 5
+      key1 : 2 3
+      key2 : 5
+
+    param2 : 0 1 2 3 4
+      key1 : 0
+      key2 : 2 3
+    #=======================================================================
+
+    The views of this global index have a subset of the keys in this global
+    (model) index.
+
+    Adding a new key (e.g. a constraint) to a view will cause the view to pass
+    the new key to the global index, along with the local index and an offset.
+    This global index then stores the key and the appropriate global index
+    (which can be seen by the view).
+
+    See also:
+    ParameterIndexOperationsView
+    """
     _offset = 0
     def __init__(self, constraints=None):
         self._properties = IntArrayDict()
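The mapping described in the new docstring boils down to "global index = local index + offset". The following is a minimal, self-contained sketch of that idea (hypothetical GlobalIndex/IndexView classes, not GPy's actual implementation), using the illustration above where param2 starts at global position 5:

    import numpy as np

    class GlobalIndex(object):
        """Maps each key (e.g. a constraint) to a flat array of global indices."""
        def __init__(self):
            self._properties = {}
        def add(self, key, indices):
            old = self._properties.get(key, np.empty(0, dtype=int))
            self._properties[key] = np.union1d(old, np.asarray(indices, dtype=int))

    class IndexView(object):
        """Per-parameter view: forwards local indices to the global index, shifted by an offset."""
        def __init__(self, global_index, offset):
            self._global, self._offset = global_index, offset
        def add(self, key, local_indices):
            self._global.add(key, np.asarray(local_indices, dtype=int) + self._offset)

    glob = GlobalIndex()
    param2_view = IndexView(glob, offset=5)   # param2 begins at model position 5
    param2_view.add('key2', [2, 3])           # local 2,3 -> global 7,8
    print(glob._properties['key2'])           # [7 8], matching the model-level key2 above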
GPy/core/parameterization/ties_and_remappings.py (new file, 91 lines)

@@ -0,0 +1,91 @@
+# Copyright (c) 2014, James Hensman, Max Zwiessele
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+from parameterized import Parameterized
+from param import Param
+
+class Remapping(Parameterized):
+    def mapping(self):
+        """
+        The return value of this function gives the values which the re-mapped
+        parameters should take. Implement in sub-classes.
+        """
+        raise NotImplementedError
+
+    def callback(self):
+        raise NotImplementedError
+
+    def __str__(self):
+        return self.name
+
+    def parameters_changed(self):
+        #ensure all out parameters have the correct value, as specified by our mapping
+        index = self._highest_parent_.constraints[self]
+        self._highest_parent_.param_array[index] = self.mapping()
+        [p.notify_observers(which=self) for p in self.tied_parameters]
+
+class Fix(Remapping):
+    pass
+
+class Tie(Remapping):
+    def __init__(self, value, name):
+        super(Tie, self).__init__(name)
+        self.tied_parameters = []
+        self.value = Param('val', value)
+        self.add_parameter(self.value)
+
+    def add_tied_parameter(self, p):
+        self.tied_parameters.append(p)
+        p.add_observer(self, self.callback)
+        self.parameters_changed()
+
+    def callback(self, param=None, which=None):
+        """
+        This gets called whenever any of the tied parameters changes. We spend
+        considerable effort working out what has changed and to what value.
+        Then we store that value in self.value, and broadcast it everywhere
+        with parameters_changed.
+        """
+        if which is self: return
+        index = self._highest_parent_.constraints[self]
+        if len(index)==0:
+            return # nothing to tie together, this tie exists without any tied parameters
+        self.collate_gradient()
+        vals = self._highest_parent_.param_array[index]
+        uvals = np.unique(vals)
+        if len(uvals)==1:
+            #all of the tied things are at the same value
+            if np.all(self.value==uvals[0]):
+                return # DO NOT DO ANY CHANGES IF THE TIED PART IS NOT CHANGED!
+            self.value[...] = uvals[0]
+        elif len(uvals)==2:
+            #only *one* of the tied things has changed. it must be different to self.value
+            newval = uvals[uvals != self.value*1]
+            self.value[...] = newval
+        else:
+            #more than one of the tied things changed. panic.
+            raise ValueError, "something is wrong with the tieing"
+
+    def parameters_changed(self):
+        #ensure all out parameters have the correct value, as specified by our mapping
+        index = self._highest_parent_.constraints[self]
+        if np.all(self._highest_parent_.param_array[index]==self.value):
+            return # STOP TRIGGER THE UPDATE LOOP MULTIPLE TIMES!!!
+        self._highest_parent_.param_array[index] = self.mapping()
+        [p.notify_observers(which=self) for p in self.tied_parameters]
+        self.collate_gradient()
+
+    def mapping(self):
+        return self.value
+
+    def collate_gradient(self):
+        index = self._highest_parent_.constraints[self]
+        self.value.gradient = np.sum(self._highest_parent_.gradient[index])
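The Tie class above relies on an observer pattern: each tied Param notifies the Tie through add_observer/notify_observers, and the Tie writes the agreed value back and re-notifies, guarding against its own updates with the `which is self` check. A minimal, self-contained sketch of that pattern (hypothetical Observable/TiedValue classes, not GPy's Param machinery):

    class Observable(object):
        """A value that notifies registered observers when it changes."""
        def __init__(self, value):
            self.value = value
            self._observers = []   # list of (observer, callback)
        def add_observer(self, observer, callback):
            self._observers.append((observer, callback))
        def set(self, value, which=None):
            self.value = value
            for obs, cb in self._observers:
                cb(param=self, which=which)

    class TiedValue(object):
        """Keeps a group of Observables at a single shared value."""
        def __init__(self, value):
            self.value = value
            self.tied = []
        def add_tied_parameter(self, p):
            self.tied.append(p)
            p.add_observer(self, self.callback)
            self.broadcast()
        def callback(self, param=None, which=None):
            if which is self:           # ignore updates we caused ourselves
                return
            self.value = param.value    # adopt the changed value...
            self.broadcast()            # ...and push it to all tied parameters
        def broadcast(self):
            for p in self.tied:
                p.set(self.value, which=self)

    a, b = Observable(1.0), Observable(2.0)
    tie = TiedValue(1.0)
    tie.add_tied_parameter(a)
    tie.add_tied_parameter(b)
    a.set(5.0)
    print(b.value)   # 5.0 -- b follows a through the tie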
@@ -195,6 +195,9 @@ class Logistic(Transformation):

         self.lower, self.upper = float(lower), float(upper)
         self.difference = self.upper - self.lower
     def f(self, x):
+        if (x<-300.).any():
+            x = x.copy()
+            x[x<-300.] = -300.
         return self.lower + self.difference / (1. + np.exp(-x))
     def finv(self, f):
         return np.log(np.clip(f - self.lower, 1e-10, np.inf) / np.clip(self.upper - f, 1e-10, np.inf))
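The clipping added to Logistic.f guards against floating-point overflow: for very negative x, np.exp(-x) overflows to inf and raises a RuntimeWarning, even though the limit of the transform is simply `lower`. A small plain-numpy check of the behaviour this change works around (independent of GPy):

    import numpy as np

    x = np.array([-1000., 0., 10.])
    with np.errstate(over='warn'):
        raw = 1. / (1. + np.exp(-x))     # exp(1000) overflows to inf; raw[0] becomes 0.0 via 1/inf

    x_clipped = np.clip(x, -300., None)   # same effect as the in-place clipping in the diff
    safe = 1. / (1. + np.exp(-x_clipped)) # exp(300) ~ 1.9e130 is still representable in float64
    print(raw, safe)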
@@ -34,36 +34,38 @@ class NormalPrior(VariationalPrior):

         variational_posterior.variance.gradient -= (1. - (1. / (variational_posterior.variance))) * 0.5

 class SpikeAndSlabPrior(VariationalPrior):
-    def __init__(self, pi, variance = 1.0, name='SpikeAndSlabPrior', **kw):
+    def __init__(self, pi=None, learnPi=False, variance = 1.0, name='SpikeAndSlabPrior', **kw):
         super(VariationalPrior, self).__init__(name=name, **kw)
-        assert variance==1.0, "Not Implemented!"
         self.pi = Param('pi', pi, Logistic(1e-10,1.-1e-10))
         self.variance = Param('variance',variance)
-        self.add_parameters(self.pi)
-        self.group_spike_prob = False
+        self.learnPi = learnPi
+        if learnPi:
+            self.add_parameters(self.pi)

     def KL_divergence(self, variational_posterior):
         mu = variational_posterior.mean
         S = variational_posterior.variance
         gamma = variational_posterior.binary_prob
-        var_mean = np.square(mu)
-        var_S = (S - np.log(S))
+        var_mean = np.square(mu)/self.variance
+        var_S = (S/self.variance - np.log(S))
         var_gamma = (gamma*np.log(gamma/self.pi)).sum()+((1-gamma)*np.log((1-gamma)/(1-self.pi))).sum()
-        return var_gamma+ 0.5 * (gamma* (var_mean + var_S -1)).sum()
+        return var_gamma+ (gamma* (np.log(self.variance)-1. +var_mean + var_S)).sum()/2.

     def update_gradients_KL(self, variational_posterior):
         mu = variational_posterior.mean
         S = variational_posterior.variance
         gamma = variational_posterior.binary_prob

-        if self.group_spike_prob:
-            gamma_grad = np.log((1-self.pi)/self.pi*gamma/(1.-gamma))+(np.square(mu)+S-np.log(S)-1.)/2.
-            gamma.gradient -= gamma_grad.mean(axis=0)
-        else:
-            gamma.gradient -= np.log((1-self.pi)/self.pi*gamma/(1.-gamma))+(np.square(mu)+S-np.log(S)-1.)/2.
-        mu.gradient -= gamma*mu
-        S.gradient -= (1. - (1. / (S))) * gamma /2.
-        self.pi.gradient = (gamma/self.pi - (1.-gamma)/(1.-self.pi)).sum(axis=0)
+        gamma.gradient -= np.log((1-self.pi)/self.pi*gamma/(1.-gamma))+((np.square(mu)+S)/self.variance-np.log(S)+np.log(self.variance)-1.)/2.
+        mu.gradient -= gamma*mu/self.variance
+        S.gradient -= (1./self.variance - 1./S) * gamma /2.
+        if self.learnPi:
+            if len(self.pi)==1:
+                self.pi.gradient = (gamma/self.pi - (1.-gamma)/(1.-self.pi)).sum()
+            elif len(self.pi.shape)==1:
+                self.pi.gradient = (gamma/self.pi - (1.-gamma)/(1.-self.pi)).sum(axis=0)
+            else:
+                self.pi.gradient = (gamma/self.pi - (1.-gamma)/(1.-self.pi))

 class VariationalPosterior(Parameterized):
     def __init__(self, means=None, variances=None, name='latent space', *a, **kw):
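For reference, the KL term the updated KL_divergence computes (slab variance \sigma^2 = self.variance, inclusion probability \pi = self.pi; a transcription of the code above rather than an independent derivation) is:

    KL = \sum_{n,q} \Big[ \gamma_{nq}\log\frac{\gamma_{nq}}{\pi} + (1-\gamma_{nq})\log\frac{1-\gamma_{nq}}{1-\pi} \Big]
         + \frac{1}{2}\sum_{n,q} \gamma_{nq}\Big( \log\sigma^2 - 1 + \frac{\mu_{nq}^2}{\sigma^2} + \frac{S_{nq}}{\sigma^2} - \log S_{nq} \Big)

which reduces to the previous expression when \sigma^2 = 1, explaining why the old assert variance==1.0 could be dropped.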
@@ -155,6 +157,9 @@ class SpikeAndSlabPosterior(VariationalPosterior):

         n.parameters[dc['mean']._parent_index_] = dc['mean']
         n.parameters[dc['variance']._parent_index_] = dc['variance']
         n.parameters[dc['binary_prob']._parent_index_] = dc['binary_prob']
+        n._gradient_array_ = None
+        oversize = self.size - self.mean.size - self.variance.size
+        n.size = n.mean.size + n.variance.size + oversize
         n.ndim = n.mean.ndim
         n.shape = n.mean.shape
         n.num_data = n.mean.shape[0]

@@ -163,7 +168,7 @@ class SpikeAndSlabPosterior(VariationalPosterior):

         else:
             return super(VariationalPrior, self).__getitem__(s)

-    def plot(self, *args):
+    def plot(self, *args, **kwargs):
        """
        Plot latent space X in 1D:

@@ -172,4 +177,4 @@ class SpikeAndSlabPosterior(VariationalPosterior):

         import sys
         assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
         from ...plotting.matplot_dep import variational_plots
-        return variational_plots.plot_SpikeSlab(self,*args)
+        return variational_plots.plot_SpikeSlab(self,*args, **kwargs)
@@ -118,5 +118,3 @@ class SparseGP(GP):

         psi2 = kern.psi2(self.Z, Xnew)
         var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1)
         return mu, var
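The predictive-variance line above contracts the N x M x M psi2 tensor against an M x M matrix. The same contraction can be written with einsum; a small numpy check of the equivalence (generic shapes, independent of GPy):

    import numpy as np

    N, M = 4, 3
    psi2 = np.random.randn(N, M, M)
    Kmmi_LmiBLmi = np.random.randn(M, M)

    a = np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1)   # as in the diff
    b = np.einsum('nmo,mo->n', psi2, Kmmi_LmiBLmi)               # equivalent contraction
    assert np.allclose(a, b)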
@@ -183,6 +183,33 @@ def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40,

         plt.close(fig)
     return m

+def ssgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, max_iters=1000, **k):
+    import GPy
+    from matplotlib import pyplot as plt
+    from ..util.misc import param_to_array
+    import numpy as np
+
+    _np.random.seed(0)
+    data = GPy.util.datasets.oil()
+
+    kernel = GPy.kern.RBF(Q, 1., 1./_np.random.uniform(0,1,(Q,)), ARD=True)# + GPy.kern.Bias(Q, _np.exp(-2))
+    Y = data['X'][:N]
+    m = GPy.models.SSGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing, **k)
+    m.data_labels = data['Y'][:N].argmax(axis=1)
+
+    if optimize:
+        m.optimize('bfgs', messages=verbose, max_iters=max_iters, gtol=.05)
+
+    if plot:
+        fig, (latent_axes, sense_axes) = plt.subplots(1, 2)
+        m.plot_latent(ax=latent_axes, labels=m.data_labels)
+        data_show = GPy.plotting.matplot_dep.visualize.vector_show((m.Y[0,:]))
+        lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(param_to_array(m.X.mean)[0:1,:], # @UnusedVariable
+                             m, data_show, latent_axes=latent_axes, sense_axes=sense_axes, labels=m.data_labels)
+        raw_input('Press enter to finish')
+        plt.close(fig)
+    return m
+
 def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False):
     _np.random.seed(1234)
@@ -288,6 +315,31 @@ def bgplvm_simulation(optimize=True, verbose=1,

         m.kern.plot_ARD('BGPLVM Simulation ARD Parameters')
     return m

+def ssgplvm_simulation(optimize=True, verbose=1,
+                       plot=True, plot_sim=False,
+                       max_iters=2e4, useGPU=False
+                       ):
+    from GPy import kern
+    from GPy.models import SSGPLVM
+
+    D1, D2, D3, N, num_inducing, Q = 13, 5, 8, 45, 3, 9
+    _, _, Ylist = _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim)
+    Y = Ylist[0]
+    k = kern.Linear(Q, ARD=True, useGPU=useGPU)# + kern.white(Q, _np.exp(-2)) # + kern.bias(Q)
+    #k = kern.RBF(Q, ARD=True, lengthscale=10.)
+    m = SSGPLVM(Y, Q, init="pca", num_inducing=num_inducing, kernel=k)
+    m.X.variance[:] = _np.random.uniform(0,.01,m.X.shape)
+    m.likelihood.variance = .1
+
+    if optimize:
+        print "Optimizing model:"
+        m.optimize('scg', messages=verbose, max_iters=max_iters,
+                   gtol=.05)
+    if plot:
+        m.X.plot("SSGPLVM Latent Space 1D")
+        m.kern.plot_ARD('SSGPLVM Simulation ARD Parameters')
+    return m
+
 def bgplvm_simulation_missing_data(optimize=True, verbose=1,
                                    plot=True, plot_sim=False,
                                    max_iters=2e4,
@@ -153,9 +153,14 @@ class Posterior(object):

         $$
         """
         if self._woodbury_inv is None:
-            self._woodbury_inv, _ = dpotri(self.woodbury_chol, lower=1)
-            #self._woodbury_inv, _ = dpotrs(self.woodbury_chol, np.eye(self.woodbury_chol.shape[0]), lower=1)
-            symmetrify(self._woodbury_inv)
+            if self._woodbury_chol is not None:
+                self._woodbury_inv, _ = dpotri(self._woodbury_chol, lower=1)
+                #self._woodbury_inv, _ = dpotrs(self.woodbury_chol, np.eye(self.woodbury_chol.shape[0]), lower=1)
+                symmetrify(self._woodbury_inv)
+            elif self._covariance is not None:
+                B = self._K - self._covariance
+                tmp, _ = dpotrs(self.K_chol, B)
+                self._woodbury_inv, _ = dpotrs(self.K_chol, tmp.T)
         return self._woodbury_inv

     @property
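In the new elif branch the Woodbury inverse is rebuilt from a stored posterior covariance \Sigma rather than from a Cholesky factor. Written out (a direct reading of the two dpotrs solves above, assuming K and \Sigma are symmetric, as they are for a GP posterior):

    woodbury_inv = K^{-1} (K - \Sigma) K^{-1}

i.e. B = K - \Sigma is solved against K's Cholesky factor once from the left and once, via the transpose of the intermediate result, from the right.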
@@ -64,20 +64,9 @@ class VarDTC(LatentFunctionInference):

         return Y * prec # TODO chache this, and make it effective

     def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
-        if isinstance(X, VariationalPosterior):
-            uncertain_inputs = True
-            psi0 = kern.psi0(Z, X)
-            psi1 = kern.psi1(Z, X)
-            psi2 = kern.psi2(Z, X)
-        else:
-            uncertain_inputs = False
-            psi0 = kern.Kdiag(X)
-            psi1 = kern.K(X, Z)
-            psi2 = None
-
-        #see whether we're using variational uncertain inputs
         _, output_dim = Y.shape
+        uncertain_inputs = isinstance(X, VariationalPosterior)

         #see whether we've got a different noise variance for each datum
         beta = 1./np.fmax(likelihood.gaussian_variance(Y_metadata), 1e-6)

@@ -98,23 +87,21 @@ class VarDTC(LatentFunctionInference):

         diag.add(Kmm, self.const_jitter)
         Lm = jitchol(Kmm)

-        # The rather complex computations of A
+        # The rather complex computations of A, and the psi stats
         if uncertain_inputs:
+            psi0 = kern.psi0(Z, X)
+            psi1 = kern.psi1(Z, X)
             if het_noise:
-                psi2_beta = psi2 * (beta.flatten().reshape(num_data, 1, 1)).sum(0)
+                psi2_beta = np.sum([kern.psi2(Z,X[i:i+1,:]) * beta_i for i,beta_i in enumerate(beta)],0)
             else:
-                psi2_beta = psi2.sum(0) * beta
-            #if 0:
-            #    evals, evecs = linalg.eigh(psi2_beta)
-            #    clipped_evals = np.clip(evals, 0., 1e6) # TODO: make clipping configurable
-            #    if not np.array_equal(evals, clipped_evals):
-            #        pass # print evals
-            #    tmp = evecs * np.sqrt(clipped_evals)
-            #    tmp = tmp.T
-            # no backsubstitution because of bound explosion on tr(A) if not...
+                psi2_beta = kern.psi2(Z,X) * beta
             LmInv = dtrtri(Lm)
             A = LmInv.dot(psi2_beta.dot(LmInv.T))
         else:
+            psi0 = kern.Kdiag(X)
+            psi1 = kern.K(X, Z)
+            psi2 = None
             if het_noise:
                 tmp = psi1 * (np.sqrt(beta.reshape(num_data, 1)))
             else:

@@ -151,7 +138,7 @@ class VarDTC(LatentFunctionInference):

         log_marginal = _compute_log_marginal_likelihood(likelihood, num_data, output_dim, beta, het_noise,
                                                         psi0, A, LB, trYYT, data_fit, VVT_factor)

-        #put the gradients in the right places
+        #noise derivatives
         dL_dR = _compute_dL_dR(likelihood,
                                het_noise, uncertain_inputs, LB,
                                _LBi_Lmi_psi1Vf, DBi_plus_BiPBi, Lm, A,

@@ -160,6 +147,7 @@ class VarDTC(LatentFunctionInference):

         dL_dthetaL = likelihood.exact_inference_gradients(dL_dR,Y_metadata)

+        #put the gradients in the right places
         if uncertain_inputs:
             grad_dict = {'dL_dKmm': dL_dKmm,
                          'dL_dpsi0':dL_dpsi0,

@@ -403,10 +391,7 @@ def _compute_dL_dpsi(num_inducing, num_data, output_dim, beta, Lm, VVT_factor, C

         dL_dpsi2 = None
     else:
         dL_dpsi2 = beta * dL_dpsi2_beta
-    if uncertain_inputs:
-        # repeat for each of the N psi_2 matrices
-        dL_dpsi2 = np.repeat(dL_dpsi2[None, :, :], num_data, axis=0)
-    else:
+    if not uncertain_inputs:
         # subsume back into psi1 (==Kmn)
         dL_dpsi1 += 2.*np.dot(psi1, dL_dpsi2)
         dL_dpsi2 = None
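The A matrix built in the uncertain-inputs branch is the beta-weighted psi2 statistic whitened by the Cholesky factor of Kmm, A = Lm^{-1} (sum_n beta_n psi2_n) Lm^{-T}. A minimal numpy/scipy sketch of the same computation using triangular solves instead of GPy's dtrtri (generic stand-in matrices, not GPy code):

    import numpy as np
    from scipy.linalg import cholesky, solve_triangular

    M = 4
    Kmm = np.eye(M) + 0.1*np.ones((M, M))          # stand-in for kern.K(Z)
    psi2_beta = np.full((M, M), 0.05) + np.eye(M)  # stand-in for the beta-weighted psi2 sum

    Lm = cholesky(Kmm, lower=True)
    tmp = solve_triangular(Lm, psi2_beta, lower=True)   # Lm^{-1} Psi2beta
    A = solve_triangular(Lm, tmp.T, lower=True).T       # (Lm^{-1} Psi2beta) Lm^{-T}

    # same result as the diff's LmInv.dot(psi2_beta.dot(LmInv.T))
    LmInv = np.linalg.inv(Lm)
    assert np.allclose(A, LmInv.dot(psi2_beta.dot(LmInv.T)))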
@@ -16,7 +16,7 @@ try:

     import scikits.cuda.linalg as culinalg
     import pycuda.gpuarray as gpuarray
     from scikits.cuda import cublas
-    from ...util.linalg_gpu import logDiagSum, strideSum, mul_bcast, sum_axis, outer_prod, mul_bcast_first, join_prod
+    from ...util.linalg_gpu import logDiagSum, strideSum, mul_bcast, sum_axis, outer_prod, mul_bcast_first, join_prod, traceDot
 except:
     pass

@@ -66,18 +66,13 @@ class VarDTC_GPU(LatentFunctionInference):

                          'beta_gpu'       :gpuarray.empty((ndata,),np.float64,order='F'),
                          'YT_gpu'         :gpuarray.to_gpu(np.asfortranarray(Y.T)), # DxN
                          'betaYT_gpu'     :gpuarray.empty(Y.T.shape,np.float64,order='F'), # DxN
-                         'psi2_t_gpu'     :gpuarray.empty((num_inducing*num_inducing*self.batchsize),np.float64,order='F'),
                          # inference_minibatch
                          'dL_dpsi0_gpu'   :gpuarray.empty((self.batchsize,),np.float64,order='F'),
                          'dL_dpsi1_gpu'   :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'),
-                         'dL_dpsi2_gpu'   :gpuarray.empty((self.batchsize,num_inducing,num_inducing),np.float64,order='F'),
-                         'dL_dthetaL_gpu' :gpuarray.empty((self.batchsize,),np.float64,order='F'),
-                         'betapsi1_gpu'   :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'),
-                         'thetaL_t_gpu'   :gpuarray.empty((self.batchsize,),np.float64,order='F'),
-                         'betaYT2_gpu'    :gpuarray.empty((output_dim,self.batchsize),np.float64,order='F'),
+                         'dL_dpsi2_gpu'   :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'),
                          'psi0p_gpu'      :gpuarray.empty((self.batchsize,),np.float64,order='F'),
                          'psi1p_gpu'      :gpuarray.empty((self.batchsize,num_inducing),np.float64,order='F'),
-                         'psi2p_gpu'      :gpuarray.empty((self.batchsize,num_inducing,num_inducing),np.float64,order='F'),
+                         'psi2p_gpu'      :gpuarray.empty((num_inducing,num_inducing),np.float64,order='F'),
                          }
         self.gpuCache['ones_gpu'].fill(1.0)
@@ -126,6 +121,89 @@ class VarDTC_GPU(LatentFunctionInference):

         else:
             return jitchol(tdot(Y))

+    def gatherPsiStat(self, kern, X, Z, Y, beta, uncertain_inputs, het_noise):
+        num_inducing, input_dim = Z.shape[0], Z.shape[1]
+        num_data, output_dim = Y.shape
+        trYYT = self._trYYT
+        psi1Y_gpu = self.gpuCache['psi1Y_gpu']
+        psi2_gpu = self.gpuCache['psi2_gpu']
+        beta_gpu = self.gpuCache['beta_gpu']
+        YT_gpu = self.gpuCache['YT_gpu']
+        betaYT_gpu = self.gpuCache['betaYT_gpu']
+
+        beta_gpu.fill(beta)
+        betaYT_gpu.fill(0.)
+        cublas.cublasDaxpy(self.cublas_handle, betaYT_gpu.size, beta, YT_gpu.gpudata, 1, betaYT_gpu.gpudata, 1)
+        YRY_full = trYYT*beta
+
+        if kern.useGPU:
+            psi1Y_gpu.fill(0.)
+            psi2_gpu.fill(0.)
+            psi0_full = 0
+
+            for n_start in xrange(0,num_data,self.batchsize):
+                n_end = min(self.batchsize+n_start, num_data)
+                ndata = n_end - n_start
+                X_slice = X[n_start:n_end]
+                betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end]
+
+                if uncertain_inputs:
+                    psi0 = kern.psi0(Z, X_slice)
+                    psi1p_gpu = kern.psi1(Z, X_slice)
+                    psi2p_gpu = kern.psi2(Z, X_slice)
+                else:
+                    psi0 = kern.Kdiag(X_slice)
+                    psi1p_gpu = kern.K(X_slice, Z)
+
+                cublas.cublasDgemm(self.cublas_handle, 'T', 'T', num_inducing, output_dim, ndata, 1.0, psi1p_gpu.gpudata, ndata, betaYT_gpu_slice.gpudata, output_dim, 1.0, psi1Y_gpu.gpudata, num_inducing)
+
+                psi0_full += psi0.sum()
+
+                if uncertain_inputs:
+                    sum_axis(psi2_gpu,psi2p_gpu,1,1)
+                else:
+                    cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, num_inducing, ndata, beta, psi1p_gpu.gpudata, ndata, psi1p_gpu.gpudata, ndata, 1.0, psi2_gpu.gpudata, num_inducing)
+
+            psi0_full *= beta
+            if uncertain_inputs:
+                cublas.cublasDscal(self.cublas_handle, psi2_gpu.size, beta, psi2_gpu.gpudata, 1)
+
+        else:
+            psi2_full = np.zeros((num_inducing,num_inducing))
+            psi1Y_full = np.zeros((output_dim,num_inducing)) # DxM
+            psi0_full = 0.
+            YRY_full = 0.
+
+            for n_start in xrange(0,num_data,self.batchsize):
+                n_end = min(self.batchsize+n_start, num_data)
+                Y_slice = Y[n_start:n_end]
+                X_slice = X[n_start:n_end]
+
+                if het_noise:
+                    b = beta[n_start]
+                    YRY_full += np.inner(Y_slice, Y_slice)*b
+                else:
+                    b = beta
+
+                if uncertain_inputs:
+                    psi0 = kern.psi0(Z, X_slice)
+                    psi1 = kern.psi1(Z, X_slice)
+                    psi2_full += kern.psi2(Z, X_slice)*b
+                else:
+                    psi0 = kern.Kdiag(X_slice)
+                    psi1 = kern.K(X_slice, Z)
+                    psi2_full += np.dot(psi1.T,psi1)*b
+
+                psi0_full += psi0.sum()*b
+                psi1Y_full += np.dot(Y_slice.T,psi1)*b # DxM
+
+            if not het_noise:
+                YRY_full = trYYT*beta
+            psi1Y_gpu.set(psi1Y_full)
+            psi2_gpu.set(psi2_full)
+
+        return psi0_full, YRY_full
+
     def inference_likelihood(self, kern, X, Z, likelihood, Y):
         """
         The first phase of inference:
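What gatherPsiStat accumulates is, in essence, the psi statistics summed over minibatches and scaled by the noise precision. A minimal CPU/numpy sketch of that accumulation for the certain-inputs, homoscedastic case (toy RBF kernel, hypothetical shapes; not the GPU code path above):

    import numpy as np

    def rbf(X, Z):                      # toy kernel, stand-in for kern.K
        d = ((X[:, None, :] - Z[None, :, :])**2).sum(-1)
        return np.exp(-0.5*d)

    N, M, D, Q, batch, beta = 10, 3, 2, 2, 4, 5.0
    X, Z, Y = np.random.randn(N, Q), np.random.randn(M, Q), np.random.randn(N, D)

    psi0_full, psi1Y_full, psi2_full = 0., np.zeros((D, M)), np.zeros((M, M))
    for n_start in range(0, N, batch):                 # accumulate over minibatches
        Xs, Ys = X[n_start:n_start+batch], Y[n_start:n_start+batch]
        psi1 = rbf(Xs, Z)                              # batch x M
        psi0_full += len(Xs)*beta                      # Kdiag of the toy RBF is all ones
        psi1Y_full += np.dot(Ys.T, psi1)*beta          # D x M
        psi2_full += np.dot(psi1.T, psi1)*beta         # M x M

    # same totals computed in one shot
    psi1_all = rbf(X, Z)
    assert np.allclose(psi1Y_full, np.dot(Y.T, psi1_all)*beta)
    assert np.allclose(psi2_full, np.dot(psi1_all.T, psi1_all)*beta)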
@@ -137,6 +215,12 @@ class VarDTC_GPU(LatentFunctionInference):

         num_inducing, input_dim = Z.shape[0], Z.shape[1]
         num_data, output_dim = Y.shape
+
+        #see whether we've got a different noise variance for each datum
+        beta = 1./np.fmax(likelihood.variance, 1e-6)
+        het_noise = beta.size > 1
+        if het_noise:
+            self.batchsize=0
+
         self._initGPUCache(kern, num_inducing, input_dim, output_dim, Y)

         if isinstance(X, VariationalPosterior):
@@ -144,123 +228,10 @@ class VarDTC_GPU(LatentFunctionInference):

         else:
             uncertain_inputs = False

-        #see whether we've got a different noise variance for each datum
-        beta = 1./np.fmax(likelihood.variance, 1e-6)
-        het_noise = beta.size > 1
-        trYYT = self._trYYT
-
         psi1Y_gpu = self.gpuCache['psi1Y_gpu']
         psi2_gpu = self.gpuCache['psi2_gpu']
-        beta_gpu = self.gpuCache['beta_gpu']
-        YT_gpu = self.gpuCache['YT_gpu']
-        betaYT_gpu = self.gpuCache['betaYT_gpu']
-        psi2_t_gpu = self.gpuCache['psi2_t_gpu']
-
-        if het_noise:
-            beta_gpu.set(np.asfortranarray(beta))
-            mul_bcast(betaYT_gpu,beta_gpu,YT_gpu,beta_gpu.size)
-            YRY_full = cublas.cublasDdot(self.cublas_handle, YT_gpu.size, betaYT_gpu.gpudata, 1, YT_gpu.gpudata, 1)
-        else:
-            beta_gpu.fill(beta)
-            betaYT_gpu.fill(0.)
-            cublas.cublasDaxpy(self.cublas_handle, betaYT_gpu.size, beta, YT_gpu.gpudata, 1, betaYT_gpu.gpudata, 1)
-            YRY_full = trYYT*beta
-
-        if kern.useGPU:
-            psi1Y_gpu.fill(0.)
-            psi2_gpu.fill(0.)
-            psi0_full = 0
-
-            for n_start in xrange(0,num_data,self.batchsize):
-                n_end = min(self.batchsize+n_start, num_data)
-                ndata = n_end - n_start
-                X_slice = X[n_start:n_end]
-                beta_gpu_slice = beta_gpu[n_start:n_end]
-                betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end]
-                if ndata==self.batchsize:
-                    psi2_t_gpu_slice = psi2_t_gpu
-                else:
-                    psi2_t_gpu_slice = psi2_t_gpu[:num_inducing*num_inducing*ndata]
-                if uncertain_inputs:
-                    psi0p_gpu = kern.psi0(Z, X_slice)
-                    psi1p_gpu = kern.psi1(Z, X_slice)
-                    psi2p_gpu = kern.psi2(Z, X_slice)
-                else:
-                    psi0p_gpu = kern.Kdiag(X_slice)
-                    psi1p_gpu = kern.K(X_slice, Z)
-
-                cublas.cublasDgemm(self.cublas_handle, 'T', 'T', num_inducing, output_dim, ndata, 1.0, psi1p_gpu.gpudata, ndata, betaYT_gpu_slice.gpudata, output_dim, 1.0, psi1Y_gpu.gpudata, num_inducing)
-
-                if het_noise:
-                    psi0_full += cublas.cublasDdot(self.cublas_handle, psi0p_gpu.size, beta_gpu_slice.gpudata, 1, psi0p_gpu.gpudata, 1)
-                else:
-                    psi0_full += gpuarray.sum(psi0p_gpu).get()
-
-                if uncertain_inputs:
-                    if het_noise:
-                        mul_bcast(psi2_t_gpu_slice,beta_gpu_slice,psi2p_gpu,beta_gpu_slice.size)
-                        sum_axis(psi2_gpu,psi2_t_gpu_slice,1,ndata)
-                    else:
-                        sum_axis(psi2_gpu,psi2p_gpu,1,ndata)
-                else:
-                    if het_noise:
-                        psi1_t_gpu = psi2_t_gpu_slice[:,num_inducing*ndata]
-                        mul_bcast(psi1_t_gpu,beta_gpu_slice,psi1p_gpu,beta_gpu_slice.size)
-                        cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, num_inducing, ndata, 1.0, psi1p_gpu.gpudata, ndata, psi1_t_gpu.gpudata, ndata, 1.0, psi2_gpu.gpudata, num_inducing)
-                    else:
-                        cublas.cublasDgemm(self.cublas_handle, 'T', 'N', num_inducing, num_inducing, ndata, beta, psi1p_gpu.gpudata, ndata, psi1p_gpu.gpudata, ndata, 1.0, psi2_gpu.gpudata, num_inducing)
-
-            if not het_noise:
-                psi0_full *= beta
-                if uncertain_inputs:
-                    cublas.cublasDscal(self.cublas_handle, psi2_gpu.size, beta, psi2_gpu.gpudata, 1)
-
-        else:
-            psi2_full = np.zeros((num_inducing,num_inducing),order='F')
-            psi1Y_full = np.zeros((num_inducing,output_dim),order='F') # MxD
-            psi0_full = 0
-            # YRY_full = 0
-
-            for n_start in xrange(0,num_data,self.batchsize):
-                n_end = min(self.batchsize+n_start, num_data)
-                Y_slice = Y[n_start:n_end]
-                X_slice = X[n_start:n_end]
-                if uncertain_inputs:
-                    psi0 = kern.psi0(Z, X_slice)
-                    psi1 = kern.psi1(Z, X_slice)
-                    psi2 = kern.psi2(Z, X_slice)
-                else:
-                    psi0 = kern.Kdiag(X_slice)
-                    psi1 = kern.K(X_slice, Z)
-
-                if het_noise:
-                    beta_slice = beta[n_start:n_end]
-                    psi0_full += (beta_slice*psi0).sum()
-                    psi1Y_full += np.dot(psi1.T,beta_slice[:,None]*Y_slice) # MxD
-                    # YRY_full += (beta_slice*np.square(Y_slice).sum(axis=-1)).sum()
-                else:
-                    psi0_full += psi0.sum()
-                    psi1Y_full += np.dot(psi1.T,Y_slice) # MxD
-
-                if uncertain_inputs:
-                    if het_noise:
-                        psi2_full += np.einsum('n,nmo->mo',beta_slice,psi2)
-                    else:
-                        psi2_full += psi2.sum(axis=0)
-                else:
-                    if het_noise:
-                        psi2_full += np.einsum('n,nm,no->mo',beta_slice,psi1,psi1)
-                    else:
-                        psi2_full += tdot(psi1.T)
-
-            if not het_noise:
-                psi0_full *= beta
-                psi1Y_full *= beta
-                psi2_full *= beta
-                # YRY_full = trYYT*beta
-
-            psi1Y_gpu.set(psi1Y_full)
-            psi2_gpu.set(psi2_full)
+        psi0_full, YRY_full = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs, het_noise)

         #======================================================================
         # Compute Common Components
@@ -373,6 +344,16 @@ class VarDTC_GPU(LatentFunctionInference):

         post = Posterior(woodbury_inv=KmmInvPsi2P_gpu.get(), woodbury_vector=v_gpu.get(), K=Kmm_gpu.get(), mean=None, cov=None, K_chol=Lm_gpu.get())

+        #======================================================================
+        # Compute dL_dthetaL for uncertian input and non-heter noise
+        #======================================================================
+
+        if not het_noise:
+            dL_dthetaL = (YRY_full + output_dim*psi0_full - num_data*output_dim)/-2.
+            dL_dthetaL += cublas.cublasDdot(self.cublas_handle,dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata,1,psi2_gpu.gpudata,1)
+            dL_dthetaL += cublas.cublasDdot(self.cublas_handle,v_gpu.size, v_gpu.gpudata,1,psi1Y_gpu.gpudata,1)
+            self.midRes['dL_dthetaL'] = -beta*dL_dthetaL
+
         return logL, dL_dKmm_gpu.get(), post

     def inference_minibatch(self, kern, X, Z, likelihood, Y):
@@ -404,26 +385,26 @@ class VarDTC_GPU(LatentFunctionInference):

             nSlice = n_end-n_start
             X_slice = X[n_start:n_end]
+            if het_noise:
+                beta = beta[n_start] # nSlice==1

             if kern.useGPU:
-                if uncertain_inputs:
-                    psi0p_gpu = kern.psi0(Z, X_slice)
-                    psi1p_gpu = kern.psi1(Z, X_slice)
-                    psi2p_gpu = kern.psi2(Z, X_slice)
-                else:
+                if not uncertain_inputs:
                     psi0p_gpu = kern.Kdiag(X_slice)
                     psi1p_gpu = kern.K(X_slice, Z)
                     psi2p_gpu = self.gpuCache['psi2p_gpu']
-                    if psi2p_gpu.shape[0] > nSlice:
-                        psi2p_gpu = psi2p_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing)
-            else:
-                if uncertain_inputs:
+                elif het_noise:
+                    psi0p_gpu = kern.psi0(Z, X_slice)
+                    psi1p_gpu = kern.psi1(Z, X_slice)
+                    psi2p_gpu = kern.psi2(Z, X_slice)
+            elif not uncertain_inputs or het_noise:
+                if not uncertain_inputs:
+                    psi0 = kern.Kdiag(X_slice)
+                    psi1 = kern.K(X_slice, Z)
+                elif het_noise:
                     psi0 = kern.psi0(Z, X_slice)
                     psi1 = kern.psi1(Z, X_slice)
                     psi2 = kern.psi2(Z, X_slice)
-                else:
-                    psi0 = kern.Kdiag(X_slice)
-                    psi1 = kern.K(X_slice, Z)

                 psi0p_gpu = self.gpuCache['psi0p_gpu']
                 psi1p_gpu = self.gpuCache['psi1p_gpu']
@@ -431,91 +412,46 @@ class VarDTC_GPU(LatentFunctionInference):

                 if psi0p_gpu.shape[0] > nSlice:
                     psi0p_gpu = psi0p_gpu[:nSlice]
                     psi1p_gpu = psi1p_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing)
-                    psi2p_gpu = psi2p_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing)
                 psi0p_gpu.set(np.asfortranarray(psi0))
                 psi1p_gpu.set(np.asfortranarray(psi1))
                 if uncertain_inputs:
                     psi2p_gpu.set(np.asfortranarray(psi2))

-            #======================================================================
-            # Prepare gpu memory
-            #======================================================================
-
-            dL_dpsi2R_gpu = self.gpuCache['dL_dpsi2R_gpu']
-            v_gpu = self.gpuCache['v_gpu']
-            betaYT_gpu = self.gpuCache['betaYT_gpu']
-            beta_gpu = self.gpuCache['beta_gpu']
-            dL_dpsi0_gpu = self.gpuCache['dL_dpsi0_gpu']
-            dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu']
-            dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu']
-            dL_dthetaL_gpu = self.gpuCache['dL_dthetaL_gpu']
-            psi2R_gpu = self.gpuCache['psi2_t_gpu'][:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing)
-            betapsi1_gpu = self.gpuCache['betapsi1_gpu']
-            thetaL_t_gpu = self.gpuCache['thetaL_t_gpu']
-            betaYT2_gpu = self.gpuCache['betaYT2_gpu']
-
-            betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end]
-            beta_gpu_slice = beta_gpu[n_start:n_end]
-
-            # Adjust to the batch size
-            if dL_dpsi0_gpu.shape[0] > nSlice:
-                betaYT2_gpu = betaYT2_gpu[:,:nSlice]
-                dL_dpsi0_gpu = dL_dpsi0_gpu.ravel()[:nSlice]
-                dL_dpsi1_gpu = dL_dpsi1_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing)
-                dL_dpsi2_gpu = dL_dpsi2_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing)
-                dL_dthetaL_gpu = dL_dthetaL_gpu.ravel()[:nSlice]
-                psi2R_gpu = psi2R_gpu.ravel()[:nSlice*num_inducing*num_inducing].reshape(nSlice,num_inducing,num_inducing)
-                thetaL_t_gpu = thetaL_t_gpu.ravel()[:nSlice]
-                betapsi1_gpu = betapsi1_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing)
-
-            mul_bcast(betapsi1_gpu,beta_gpu_slice,psi1p_gpu,beta_gpu_slice.size)
-
             #======================================================================
             # Compute dL_dpsi
             #======================================================================

+            dL_dpsi2R_gpu = self.gpuCache['dL_dpsi2R_gpu']
+            v_gpu = self.gpuCache['v_gpu']
+            dL_dpsi0_gpu = self.gpuCache['dL_dpsi0_gpu']
+            dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu']
+            dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu']
+            betaYT_gpu = self.gpuCache['betaYT_gpu']
+            betaYT_gpu_slice = betaYT_gpu[:,n_start:n_end]
+
-            dL_dpsi0_gpu.fill(0.)
-            cublas.cublasDaxpy(self.cublas_handle, dL_dpsi0_gpu.size, output_dim/(-2.), beta_gpu_slice.gpudata, 1, dL_dpsi0_gpu.gpudata, 1)
+            # Adjust to the batch size
+            if dL_dpsi0_gpu.shape[0] > nSlice:
+                dL_dpsi0_gpu = dL_dpsi0_gpu.ravel()[:nSlice]
+                dL_dpsi1_gpu = dL_dpsi1_gpu.ravel()[:nSlice*num_inducing].reshape(nSlice,num_inducing)
+
+            dL_dpsi0_gpu.fill(-output_dim *beta/2.)
+
             cublas.cublasDgemm(self.cublas_handle, 'T', 'T', nSlice, num_inducing, output_dim, 1.0, betaYT_gpu_slice.gpudata, output_dim, v_gpu.gpudata, num_inducing, 0., dL_dpsi1_gpu.gpudata, nSlice)

             if uncertain_inputs:
-                outer_prod(dL_dpsi2_gpu,beta_gpu_slice,dL_dpsi2R_gpu,beta_gpu_slice.size)
+                cublas.cublasDcopy(self.cublas_handle, dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata, 1, dL_dpsi2_gpu.gpudata, 1)
+                cublas.cublasDscal(self.cublas_handle, dL_dpsi2_gpu.size, beta, dL_dpsi2_gpu.gpudata, 1)
             else:
-                cublas.cublasDgemm(self.cublas_handle, 'N', 'N', nSlice, num_inducing, output_dim, 1.0, betapsi1_gpu.gpudata, nSlice, dL_dpsi2R_gpu.gpudata, num_inducing, 1.0, dL_dpsi1_gpu.gpudata, nSlice)
+                cublas.cublasDgemm(self.cublas_handle, 'N', 'N', nSlice, num_inducing, output_dim, beta, psi1p_gpu.gpudata, nSlice, dL_dpsi2R_gpu.gpudata, num_inducing, 1.0, dL_dpsi1_gpu.gpudata, nSlice)

             #======================================================================
             # Compute dL_dthetaL
             #======================================================================

-            if not uncertain_inputs:
-                join_prod(psi2p_gpu,psi1p_gpu,psi1p_gpu,nSlice,num_inducing)
-            mul_bcast_first(psi2R_gpu,dL_dpsi2R_gpu,psi2p_gpu,nSlice)
-
-            dL_dthetaL_gpu.fill(0.)
-
-            cublas.cublasDcopy(self.cublas_handle, betaYT_gpu_slice.size, betaYT_gpu_slice.gpudata, 1, betaYT2_gpu.gpudata, 1)
-            mul_bcast(betaYT2_gpu,betaYT2_gpu,betaYT2_gpu,betaYT2_gpu.size)
-            cublas.cublasDscal(self.cublas_handle, betaYT2_gpu.size, 0.5, betaYT2_gpu.gpudata, 1)
-            sum_axis(dL_dthetaL_gpu, betaYT2_gpu, 1, output_dim)
-
-            cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, output_dim/(-2.0), beta_gpu_slice.gpudata, 1, dL_dthetaL_gpu.gpudata, 1)
-            cublas.cublasDcopy(self.cublas_handle, beta_gpu_slice.size, beta_gpu_slice.gpudata, 1, thetaL_t_gpu.gpudata, 1)
-            mul_bcast(thetaL_t_gpu,thetaL_t_gpu,thetaL_t_gpu,thetaL_t_gpu.size)
-            mul_bcast(thetaL_t_gpu,thetaL_t_gpu,psi0p_gpu,thetaL_t_gpu.size)
-            cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, output_dim/2.0, thetaL_t_gpu.gpudata, 1, dL_dthetaL_gpu.gpudata, 1)
-
-            thetaL_t_gpu.fill(0.)
-            sum_axis(thetaL_t_gpu, psi2R_gpu, nSlice, num_inducing*num_inducing)
-            mul_bcast(thetaL_t_gpu,thetaL_t_gpu,beta_gpu_slice,thetaL_t_gpu.size)
-            mul_bcast(thetaL_t_gpu,thetaL_t_gpu,beta_gpu_slice,thetaL_t_gpu.size)
-            cublas.cublasDaxpy(self.cublas_handle, dL_dthetaL_gpu.size, -1.0, thetaL_t_gpu.gpudata, 1, dL_dthetaL_gpu.gpudata, 1)
-
-            cublas.cublasDgemm(self.cublas_handle, 'T', 'T', output_dim, nSlice, num_inducing, -1.0, v_gpu.gpudata, num_inducing, betapsi1_gpu.gpudata, nSlice, 0.0, betaYT2_gpu.gpudata, output_dim)
-            mul_bcast(betaYT2_gpu,betaYT2_gpu,betaYT_gpu_slice,betaYT2_gpu.size)
-            sum_axis(dL_dthetaL_gpu, betaYT2_gpu, 1, output_dim)
+            if het_noise:
+                betaY = betaYT_gpu_slice.get()
+                dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0p_gpu.get())-output_dim*beta)/2.
+                dL_dthetaL += -beta*beta*cublas.cublasDdot(self.cublas_handle,dL_dpsi2R_gpu.size, dL_dpsi2R_gpu.gpudata,1,psi2p_gpu.gpudata,1)
+                dL_dthetaL += -beta*(betaY*np.dot(psi1p_gpu.get(),v_gpu.get())).sum(axis=-1)

             if kern.useGPU:
                 dL_dpsi0 = dL_dpsi0_gpu
@@ -528,10 +464,11 @@ class VarDTC_GPU(LatentFunctionInference):

                 dL_dpsi2 = dL_dpsi2_gpu
             else:
                 dL_dpsi2 = dL_dpsi2_gpu.get()
-        if het_noise:
-            dL_dthetaL = dL_dthetaL_gpu.get()
-        else:
-            dL_dthetaL = gpuarray.sum(dL_dthetaL_gpu).get()
+        if not het_noise:
+            if isEnd:
+                dL_dthetaL = self.midRes['dL_dthetaL']
+            else:
+                dL_dthetaL = 0.

         if uncertain_inputs:
             grad_dict = {'dL_dpsi0':dL_dpsi0,
                          'dL_dpsi1':dL_dpsi1,
@@ -541,6 +478,6 @@ class VarDTC_GPU(LatentFunctionInference):

             grad_dict = {'dL_dKdiag':dL_dpsi0,
                          'dL_dKnm':dL_dpsi1,
                          'dL_dthetaL':dL_dthetaL}

         return isEnd, (n_start,n_end), grad_dict
@@ -10,6 +10,11 @@ from ...util.misc import param_to_array

 from . import LatentFunctionInference
 log_2_pi = np.log(2*np.pi)

+try:
+    from mpi4py import MPI
+except:
+    pass
+
 class VarDTC_minibatch(LatentFunctionInference):
     """
     An object for inference when the likelihood is Gaussian, but we want to do sparse inference.
@@ -21,22 +26,39 @@ class VarDTC_minibatch(LatentFunctionInference):

     """
     const_jitter = 1e-6
-    def __init__(self, batchsize, limit=1):
+    def __init__(self, batchsize=None, limit=1, mpi_comm=None):

         self.batchsize = batchsize
+        self.mpi_comm = mpi_comm
+        self.limit = limit

         # Cache functions
         from ...util.caching import Cacher
         self.get_trYYT = Cacher(self._get_trYYT, limit)
         self.get_YYTfactor = Cacher(self._get_YYTfactor, limit)

         self.midRes = {}
         self.batch_pos = 0 # the starting position of the current mini-batch
+        self.Y_speedup = False # Replace Y with the cholesky factor of YY.T, but the posterior inference will be wrong
+
+    def __getstate__(self):
+        # has to be overridden, as Cacher objects cannot be pickled.
+        return self.batchsize, self.limit, self.Y_speedup
+
+    def __setstate__(self, state):
+        # has to be overridden, as Cacher objects cannot be pickled.
+        self.batchsize, self.limit, self.Y_speedup = state
+        self.mpi_comm = None
+        self.midRes = {}
+        self.batch_pos = 0
+        from ...util.caching import Cacher
+        self.get_trYYT = Cacher(self._get_trYYT, self.limit)
+        self.get_YYTfactor = Cacher(self._get_YYTfactor, self.limit)

     def set_limit(self, limit):
         self.get_trYYT.limit = limit
         self.get_YYTfactor.limit = limit

     def _get_trYYT(self, Y):
         return param_to_array(np.sum(np.square(Y)))
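The new __getstate__/__setstate__ pair keeps unpicklable members (the Cacher wrappers and the MPI communicator) out of the pickle and rebuilds them on load. A minimal, generic Python illustration of that pattern (hypothetical class, not the GPy code):

    import pickle

    class Inference(object):
        def __init__(self, batchsize=None, limit=1):
            self.batchsize, self.limit = batchsize, limit
            self._cache = lambda x: x * 2        # stand-in for an unpicklable member (e.g. a Cacher)

        def __getstate__(self):
            # only the plain configuration is pickled
            return self.batchsize, self.limit

        def __setstate__(self, state):
            # rebuild the unpicklable members from the configuration
            self.batchsize, self.limit = state
            self._cache = lambda x: x * 2

    obj = pickle.loads(pickle.dumps(Inference(batchsize=32)))
    print(obj.batchsize, obj._cache(3))   # 32 6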
@@ -51,88 +73,103 @@ class VarDTC_minibatch(LatentFunctionInference):

             return param_to_array(Y)
         else:
             return jitchol(tdot(Y))

+    def gatherPsiStat(self, kern, X, Z, Y, beta, uncertain_inputs):
+
+        het_noise = beta.size > 1
+
+        trYYT = self.get_trYYT(Y)
+        if self.Y_speedup and not het_noise:
+            Y = self.get_YYTfactor(Y)
+
+        num_inducing = Z.shape[0]
+        num_data, output_dim = Y.shape
+        if self.batchsize == None:
+            self.batchsize = num_data
+
+        psi2_full = np.zeros((num_inducing,num_inducing))
+        psi1Y_full = np.zeros((output_dim,num_inducing)) # DxM
+        psi0_full = 0.
+        YRY_full = 0.
+
+        for n_start in xrange(0,num_data,self.batchsize):
+            n_end = min(self.batchsize+n_start, num_data)
+            if (n_end-n_start)==num_data:
+                Y_slice = Y
+                X_slice = X
+            else:
+                Y_slice = Y[n_start:n_end]
+                X_slice = X[n_start:n_end]
+
+            if het_noise:
+                b = beta[n_start]
+                YRY_full += np.inner(Y_slice, Y_slice)*b
+            else:
+                b = beta
+
+            if uncertain_inputs:
+                psi0 = kern.psi0(Z, X_slice)
+                psi1 = kern.psi1(Z, X_slice)
+                psi2_full += kern.psi2(Z, X_slice)*b
+            else:
+                psi0 = kern.Kdiag(X_slice)
+                psi1 = kern.K(X_slice, Z)
+                psi2_full += np.dot(psi1.T,psi1)*b
+
+            psi0_full += psi0.sum()*b
+            psi1Y_full += np.dot(Y_slice.T,psi1)*b # DxM
+
+        if not het_noise:
+            YRY_full = trYYT*beta
+
+        if self.mpi_comm != None:
+            psi0_all = np.array(psi0_full)
+            psi1Y_all = psi1Y_full.copy()
+            psi2_all = psi2_full.copy()
+            YRY_all = np.array(YRY_full)
+            self.mpi_comm.Allreduce([psi0_full, MPI.DOUBLE], [psi0_all, MPI.DOUBLE])
+            self.mpi_comm.Allreduce([psi1Y_full, MPI.DOUBLE], [psi1Y_all, MPI.DOUBLE])
+            self.mpi_comm.Allreduce([psi2_full, MPI.DOUBLE], [psi2_all, MPI.DOUBLE])
+            self.mpi_comm.Allreduce([YRY_full, MPI.DOUBLE], [YRY_all, MPI.DOUBLE])
+            return psi0_all, psi1Y_all, psi2_all, YRY_all
+
+        return psi0_full, psi1Y_full, psi2_full, YRY_full
+
     def inference_likelihood(self, kern, X, Z, likelihood, Y):
         """
         The first phase of inference:
         Compute: log-likelihood, dL_dKmm

         Cached intermediate results: Kmm, KmmInv,
         """

-        num_inducing = Z.shape[0]
         num_data, output_dim = Y.shape
+        if self.mpi_comm != None:
+            num_data_all = np.array(num_data,dtype=np.int32)
+            self.mpi_comm.Allreduce([np.int32(num_data), MPI.INT], [num_data_all, MPI.INT])
+            num_data = num_data_all

         if isinstance(X, VariationalPosterior):
             uncertain_inputs = True
         else:
             uncertain_inputs = False

         #see whether we've got a different noise variance for each datum
         beta = 1./np.fmax(likelihood.variance, 1e-6)
         het_noise = beta.size > 1
         if het_noise:
             self.batchsize = 1
-        # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
-        #self.YYTfactor = beta*self.get_YYTfactor(Y)
-        YYT_factor = Y
-        trYYT = self.get_trYYT(Y)
-
-        psi2_full = np.zeros((num_inducing,num_inducing))
-        psi1Y_full = np.zeros((output_dim,num_inducing)) # DxM
-        psi0_full = 0
-        YRY_full = 0
-
-        for n_start in xrange(0,num_data,self.batchsize):
-
-            n_end = min(self.batchsize+n_start, num_data)
-
-            Y_slice = YYT_factor[n_start:n_end]
-            X_slice = X[n_start:n_end]
-
-            if uncertain_inputs:
-                psi0 = kern.psi0(Z, X_slice)
-                psi1 = kern.psi1(Z, X_slice)
-                psi2 = kern.psi2(Z, X_slice)
-            else:
-                psi0 = kern.Kdiag(X_slice)
-                psi1 = kern.K(X_slice, Z)
-                psi2 = None
-
-            if het_noise:
-                beta_slice = beta[n_start:n_end]
-                psi0_full += (beta_slice*psi0).sum()
-                psi1Y_full += np.dot(beta_slice*Y_slice.T,psi1) # DxM
-                YRY_full += (beta_slice*np.square(Y_slice).sum(axis=-1)).sum()
-            else:
-                psi0_full += psi0.sum()
-                psi1Y_full += np.dot(Y_slice.T,psi1) # DxM
-
-            if uncertain_inputs:
-                if het_noise:
-                    psi2_full += beta_slice*psi2
-                else:
-                    psi2_full += psi2.sum(0)
-            else:
-                if het_noise:
-                    psi2_full += beta_slice*np.outer(psi1,psi1)
-                else:
-                    psi2_full += np.einsum('nm,jk->mk',psi1,psi1)
-
-        if not het_noise:
-            psi0_full *= beta
-            psi1Y_full *= beta
-            psi2_full *= beta
-            YRY_full = trYYT*beta
+        psi0_full, psi1Y_full, psi2_full, YRY_full = self.gatherPsiStat(kern, X, Z, Y, beta, uncertain_inputs)

         #======================================================================
         # Compute Common Components
         #======================================================================
-        self.psi1Y = psi1Y_full
         Kmm = kern.K(Z).copy()
         diag.add(Kmm, self.const_jitter)
         Lm = jitchol(Kmm)

         Lambda = Kmm+psi2_full
         LL = jitchol(Lambda)
         b,_ = dtrtrs(LL, psi1Y_full.T)
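When an MPI communicator is supplied, each rank accumulates the psi statistics over its local shard of the data and the partial sums are combined with Allreduce, so every rank ends up with the global statistics. A minimal mpi4py sketch of that reduction (generic arrays, run with e.g. mpiexec -n 4; not the GPy code itself):

    import numpy as np
    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    M = 3

    # each rank's partial statistic (here just a toy local contribution)
    psi2_local = np.full((M, M), float(comm.Get_rank() + 1))

    # Allreduce sums the partial arrays element-wise and gives every rank the total
    psi2_global = np.empty_like(psi2_local)
    comm.Allreduce([psi2_local, MPI.DOUBLE], [psi2_global, MPI.DOUBLE])

    print(comm.Get_rank(), psi2_global[0, 0])   # the sum over ranks, identical on every rank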
@ -140,18 +177,18 @@ class VarDTC_minibatch(LatentFunctionInference):
|
||||||
v,_ = dtrtrs(LL.T,b,lower=False)
|
v,_ = dtrtrs(LL.T,b,lower=False)
|
||||||
vvt = np.einsum('md,od->mo',v,v)
|
vvt = np.einsum('md,od->mo',v,v)
|
||||||
LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right')
|
LmInvPsi2LmInvT = backsub_both_sides(Lm,psi2_full,transpose='right')
|
||||||
|
|
||||||
Psi2LLInvT = dtrtrs(LL,psi2_full)[0].T
|
Psi2LLInvT = dtrtrs(LL,psi2_full)[0].T
|
||||||
LmInvPsi2LLInvT= dtrtrs(Lm,Psi2LLInvT)[0]
|
LmInvPsi2LLInvT= dtrtrs(Lm,Psi2LLInvT)[0]
|
||||||
KmmInvPsi2LLInvT = dtrtrs(Lm,LmInvPsi2LLInvT,trans=True)[0]
|
KmmInvPsi2LLInvT = dtrtrs(Lm,LmInvPsi2LLInvT,trans=True)[0]
|
||||||
KmmInvPsi2P = dtrtrs(LL,KmmInvPsi2LLInvT.T, trans=True)[0].T
|
KmmInvPsi2P = dtrtrs(LL,KmmInvPsi2LLInvT.T, trans=True)[0].T
|
||||||
|
|
||||||
dL_dpsi2R = (output_dim*KmmInvPsi2P - vvt)/2. # dL_dpsi2 with R inside psi2
|
dL_dpsi2R = (output_dim*KmmInvPsi2P - vvt)/2. # dL_dpsi2 with R inside psi2
|
||||||
|
|
||||||
# Cache intermediate results
|
# Cache intermediate results
|
||||||
self.midRes['dL_dpsi2R'] = dL_dpsi2R
|
self.midRes['dL_dpsi2R'] = dL_dpsi2R
|
||||||
self.midRes['v'] = v
|
self.midRes['v'] = v
|
||||||
|
|
||||||
#======================================================================
|
#======================================================================
|
||||||
# Compute log-likelihood
|
# Compute log-likelihood
|
||||||
#======================================================================
|
#======================================================================
|
||||||
@ -159,33 +196,36 @@ class VarDTC_minibatch(LatentFunctionInference):
            logL_R = -np.log(beta).sum()
        else:
            logL_R = -num_data*np.log(beta)
        logL = (
            -(output_dim*(num_data*log_2_pi+logL_R+psi0_full-np.trace(LmInvPsi2LmInvT))+YRY_full-bbt)/2.
            -output_dim*(-np.log(np.diag(Lm)).sum()+np.log(np.diag(LL)).sum())
        )
        logL = -(output_dim*(num_data*log_2_pi+logL_R+psi0_full-np.trace(LmInvPsi2LmInvT))+YRY_full-bbt)/2.-output_dim*(-np.log(np.diag(Lm)).sum()+np.log(np.diag(LL)).sum())

        #======================================================================
        # Compute dL_dKmm
        #======================================================================

        dL_dKmm = -(output_dim*np.einsum('md,od->mo',KmmInvPsi2LLInvT,KmmInvPsi2LLInvT) + vvt)/2.

        #======================================================================
        # Compute the Posterior distribution of inducing points p(u|Y)
        #======================================================================

        # phi_u_mean = np.dot(Kmm,v)
        # LLInvKmm,_ = dtrtrs(LL,Kmm)
        # # phi_u_var = np.einsum('ma,mb->ab',LLInvKmm,LLInvKmm)
        # phi_u_var = Kmm - np.dot(LLInvKmm.T,LLInvKmm)
        post = Posterior(woodbury_inv=KmmInvPsi2P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=Lm)
        if not self.Y_speedup or het_noise:
            post = Posterior(woodbury_inv=KmmInvPsi2P, woodbury_vector=v, K=Kmm, mean=None, cov=None, K_chol=Lm)
        else:
            post = None

        #======================================================================
        # Compute dL_dthetaL for uncertain inputs and non-heteroscedastic noise
        #======================================================================

        if not het_noise:
            dL_dthetaL = (YRY_full*beta + beta*output_dim*psi0_full - num_data*output_dim*beta)/2. - beta*(dL_dpsi2R*psi2_full).sum() - beta*(v.T*psi1Y_full).sum()
            self.midRes['dL_dthetaL'] = dL_dthetaL

        return logL, dL_dKmm, post
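
# A minimal sketch of how the two inference phases fit together, mirroring the
# update_gradients() helper later in this diff: inference_likelihood is called once per
# iteration, then inference_minibatch is looped until it reports the end of Y. The
# objects `inf`, `kern`, `X`, `Z`, `likelihood`, `Y` are placeholders for an assembled
# VarDTC_minibatch instance and model pieces, not definitions made by this patch.

def run_inference_sketch(inf, kern, X, Z, likelihood, Y):
    logL, dL_dKmm, posterior = inf.inference_likelihood(kern, X, Z, likelihood, Y)
    grads = []
    isEnd = False
    while not isEnd:
        isEnd, (n_start, n_end), grad_dict = inf.inference_minibatch(kern, X, Z, likelihood, Y)
        grads.append(((n_start, n_end), grad_dict))  # per-minibatch dL_dpsi* terms
    return logL, dL_dKmm, posterior, grads
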

    def inference_minibatch(self, kern, X, Z, likelihood, Y):
        """
        The second phase of inference: Computing the derivatives over a minibatch of Y
        Compute: dL_dpsi0, dL_dpsi1, dL_dpsi2, dL_dthetaL
        return a flag showing whether it reached the end of Y (isEnd)
        """
@ -196,14 +236,17 @@ class VarDTC_minibatch(LatentFunctionInference):
            uncertain_inputs = True
        else:
            uncertain_inputs = False

        #see whether we've got a different noise variance for each datum
        beta = 1./np.fmax(likelihood.variance, 1e-6)
        het_noise = beta.size > 1
        # VVT_factor is a matrix such that tdot(VVT_factor) = VVT...this is for efficiency!
        #self.YYTfactor = beta*self.get_YYTfactor(Y)
        YYT_factor = Y
        if self.Y_speedup and not het_noise:
            YYT_factor = self.get_YYTfactor(Y)
        else:
            YYT_factor = Y

        n_start = self.batch_pos
        n_end = min(self.batchsize+n_start, num_data)
        if n_end==num_data:
@ -212,66 +255,64 @@ class VarDTC_minibatch(LatentFunctionInference):
        else:
            isEnd = False
            self.batch_pos = n_end

        num_slice = n_end-n_start
        Y_slice = YYT_factor[n_start:n_end]
        X_slice = X[n_start:n_end]

        if uncertain_inputs:
            psi0 = kern.psi0(Z, X_slice)
            psi1 = kern.psi1(Z, X_slice)
            psi2 = kern.psi2(Z, X_slice)
        else:
            psi0 = kern.Kdiag(X_slice)
            psi1 = kern.K(X_slice, Z)
            psi2 = None
        if not uncertain_inputs:
            psi0 = kern.Kdiag(X_slice)
            psi1 = kern.K(X_slice, Z)
            psi2 = None
            betapsi1 = np.einsum('n,nm->nm',beta,psi1)
        elif het_noise:
            psi0 = kern.psi0(Z, X_slice)
            psi1 = kern.psi1(Z, X_slice)
            psi2 = kern.psi2(Z, X_slice)
            betapsi1 = np.einsum('n,nm->nm',beta,psi1)

        if het_noise:
            beta = beta[n_start] # assuming batchsize==1

        betaY = beta*Y_slice
        betapsi1 = np.einsum('n,nm->nm',beta,psi1)

        #======================================================================
        # Load Intermediate Results
        #======================================================================

        dL_dpsi2R = self.midRes['dL_dpsi2R']
        v = self.midRes['v']

        #======================================================================
        # Compute dL_dpsi
        #======================================================================

        dL_dpsi0 = -0.5 * output_dim * (beta * np.ones((n_end-n_start,)))
        dL_dpsi0 = -output_dim * (beta * np.ones((n_end-n_start,)))/2.

        dL_dpsi1 = np.dot(betaY,v.T)

        if uncertain_inputs:
            dL_dpsi2 = beta* dL_dpsi2R
        else:
            dL_dpsi1 += np.dot(betapsi1,dL_dpsi2R)*2.
            dL_dpsi2 = None

        #======================================================================
        # Compute dL_dthetaL
        #======================================================================

        if het_noise:
            if uncertain_inputs:
                psiR = np.einsum('mo,nmo->',dL_dpsi2R,psi2)
                psiR = np.einsum('mo,mo->',dL_dpsi2R,psi2)
            else:
                psiR = np.einsum('nm,no,mo->',psi1,psi1,dL_dpsi2R)

            dL_dthetaL = ((np.square(betaY)).sum(axis=-1) + np.square(beta)*(output_dim*psi0)-output_dim*beta)/2. - np.square(beta)*psiR- (betaY*np.dot(betapsi1,v)).sum(axis=-1)
        else:
            if uncertain_inputs:
                psiR = np.einsum('mo,nmo->',dL_dpsi2R,psi2)
            else:
                psiR = np.einsum('nm,no,mo->',psi1,psi1,dL_dpsi2R)

            dL_dthetaL = ((np.square(betaY)).sum() + beta*beta*output_dim*(psi0.sum())-num_slice*output_dim*beta)/2. - beta*beta*psiR- (betaY*np.dot(betapsi1,v)).sum()
        else:
            if isEnd:
                dL_dthetaL = self.midRes['dL_dthetaL']
            else:
                dL_dthetaL = 0.

        if uncertain_inputs:
            grad_dict = {'dL_dpsi0':dL_dpsi0,
                         'dL_dpsi1':dL_dpsi1,
@ -281,71 +322,92 @@ class VarDTC_minibatch(LatentFunctionInference):
            grad_dict = {'dL_dKdiag':dL_dpsi0,
                         'dL_dKnm':dL_dpsi1,
                         'dL_dthetaL':dL_dthetaL}

        return isEnd, (n_start,n_end), grad_dict

def update_gradients(model):
def update_gradients(model, mpi_comm=None):
    model._log_marginal_likelihood, dL_dKmm, model.posterior = model.inference_method.inference_likelihood(model.kern, model.X, model.Z, model.likelihood, model.Y)
    if mpi_comm == None:
        Y = model.Y
        X = model.X
    else:
        Y = model.Y_local
        X = model.X[model.N_range[0]:model.N_range[1]]

    model._log_marginal_likelihood, dL_dKmm, model.posterior = model.inference_method.inference_likelihood(model.kern, X, model.Z, model.likelihood, Y)

    het_noise = model.likelihood.variance.size > 1

    if het_noise:
        dL_dthetaL = np.empty((model.Y.shape[0],))
    else:
        dL_dthetaL = 0
        dL_dthetaL = np.float64(0.)

    #gradients w.r.t. kernel
    model.kern.update_gradients_full(dL_dKmm, model.Z, None)
    kern_grad = model.kern.gradient.copy()
    kern_grad[:] = 0.
    #gradients w.r.t. Z
    model.Z.gradient = model.kern.gradients_X(dL_dKmm, model.Z)
    model.Z.gradient = 0.

    isEnd = False
    while not isEnd:
        isEnd, n_range, grad_dict = model.inference_method.inference_minibatch(model.kern, model.X, model.Z, model.likelihood, model.Y)
        isEnd, n_range, grad_dict = model.inference_method.inference_minibatch(model.kern, X, model.Z, model.likelihood, Y)
        if isinstance(model.X, VariationalPosterior):
            X_slice = model.X[n_range[0]:n_range[1]]
            dL_dpsi1 = grad_dict['dL_dpsi1']#[None, :]
            dL_dpsi2 = grad_dict['dL_dpsi2'][None, :, :]
            if (n_range[1]-n_range[0])==X.shape[0]:
                X_slice = X
            elif mpi_comm ==None:
                X_slice = model.X[n_range[0]:n_range[1]]
            else:
                X_slice = model.X[model.N_range[0]+n_range[0]:model.N_range[0]+n_range[1]]

            #gradients w.r.t. kernel
            model.kern.update_gradients_expectations(variational_posterior=X_slice,Z=model.Z,dL_dpsi0=grad_dict['dL_dpsi0'],dL_dpsi1=dL_dpsi1,dL_dpsi2=dL_dpsi2)
            model.kern.update_gradients_expectations(variational_posterior=X_slice, Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'])
            kern_grad += model.kern.gradient

            #gradients w.r.t. Z
            model.Z.gradient += model.kern.gradients_Z_expectations(
                dL_dpsi0=grad_dict['dL_dpsi0'],
                dL_dpsi1=dL_dpsi1,
                dL_dpsi2=dL_dpsi2,
                Z=model.Z, variational_posterior=X_slice)
            model.Z.gradient += model.kern.gradients_Z_expectations(
                dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'], Z=model.Z, variational_posterior=X_slice)

            #gradients w.r.t. posterior parameters of X
            X_grad = model.kern.gradients_qX_expectations(
                variational_posterior=X_slice,
                Z=model.Z,
                dL_dpsi0=grad_dict['dL_dpsi0'],
                dL_dpsi1=dL_dpsi1,
                dL_dpsi2=dL_dpsi2)
            model.X.mean[n_range[0]:n_range[1]].gradient = X_grad[0]
            model.X.variance[n_range[0]:n_range[1]].gradient = X_grad[1]
            X_grad = model.kern.gradients_qX_expectations(variational_posterior=X_slice, Z=model.Z, dL_dpsi0=grad_dict['dL_dpsi0'], dL_dpsi1=grad_dict['dL_dpsi1'], dL_dpsi2=grad_dict['dL_dpsi2'])
            model.set_X_gradients(X_slice, X_grad)

        if het_noise:
            dL_dthetaL[n_range[0]:n_range[1]] = grad_dict['dL_dthetaL']
        else:
            dL_dthetaL += grad_dict['dL_dthetaL']
        #import ipdb;ipdb.set_trace()
    model.grad_dict = grad_dict

    if isinstance(model.X, VariationalPosterior):
        # Update Log-likelihood
        model._log_marginal_likelihood -= model.variational_prior.KL_divergence(model.X)
        # update for the KL divergence
        model.variational_prior.update_gradients_KL(model.X)

    # Set the gradients w.r.t. kernel
    model.kern.gradient = kern_grad

    # Gather the gradients from multiple MPI nodes
    if mpi_comm != None:
        if het_noise:
            assert False, "Not implemented!"
        kern_grad_all = kern_grad.copy()
        Z_grad_all = model.Z.gradient.copy()
        mpi_comm.Allreduce([kern_grad, MPI.DOUBLE], [kern_grad_all, MPI.DOUBLE])
        mpi_comm.Allreduce([model.Z.gradient, MPI.DOUBLE], [Z_grad_all, MPI.DOUBLE])
        kern_grad = kern_grad_all
        model.Z.gradient = Z_grad_all

    #gradients w.r.t. kernel
    model.kern.update_gradients_full(dL_dKmm, model.Z, None)
    model.kern.gradient += kern_grad

    #gradients w.r.t. Z
    model.Z.gradient += model.kern.gradients_X(dL_dKmm, model.Z)

    # Update Log-likelihood
    KL_div = model.variational_prior.KL_divergence(X)
    # update for the KL divergence
    model.variational_prior.update_gradients_KL(X)

    if mpi_comm != None:
        KL_div_all = np.array(KL_div)
        mpi_comm.Allreduce([np.float64(KL_div), MPI.DOUBLE], [KL_div_all, MPI.DOUBLE])
        KL_div = KL_div_all
        [mpi_comm.Allgatherv([pp.copy(), MPI.DOUBLE], [pa, (model.N_list*pa.shape[-1], None), MPI.DOUBLE]) for pp,pa in zip(model.get_X_gradients(X),model.get_X_gradients(model.X))]
        from ...models import SSGPLVM
        if isinstance(model, SSGPLVM):
            grad_pi = np.array(model.variational_prior.pi.gradient)
            mpi_comm.Allreduce([grad_pi.copy(), MPI.DOUBLE], [model.variational_prior.pi.gradient, MPI.DOUBLE])
    model._log_marginal_likelihood -= KL_div

    # dL_dthetaL
    model.likelihood.update_gradients(dL_dthetaL)

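# A minimal, self-contained sketch of the MPI reduction pattern used above, assuming the
# MPI name refers to mpi4py (the import is not shown in this hunk): each worker holds its
# local gradient contribution and an Allreduce (default op is SUM) leaves the summed
# gradient on every rank. Array sizes here are purely illustrative.

import numpy as np
from mpi4py import MPI

def allreduce_gradient_sketch(local_grad):
    comm = MPI.COMM_WORLD
    total_grad = np.empty_like(local_grad)
    comm.Allreduce([local_grad, MPI.DOUBLE], [total_grad, MPI.DOUBLE])  # sum over all ranks
    return total_grad

# e.g. launched as `mpirun -n 4 python script.py`; every rank then holds the same summed gradient.
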
@ -8,12 +8,13 @@ from _src.mlp import MLP
from _src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52
from _src.independent_outputs import IndependentOutputs, Hierarchical
from _src.coregionalize import Coregionalize
from _src.ssrbf import SSRBF # TODO: ZD: did you remove this?
from _src.ODE_UY import ODE_UY
from _src.ODE_UYC import ODE_UYC
from _src.ODE_st import ODE_st
from _src.ODE_t import ODE_t
from _src.poly import Poly

from _src.trunclinear import TruncLinear,TruncLinear_inf
from _src.splitKern import SplitKern,DiffGenomeKern

# TODO: put this in an init file somewhere

@ -63,13 +63,16 @@ class Add(CombinationKernel):
        target = np.zeros(X.shape)
        [target.__iadd__(p.gradients_X_diag(dL_dKdiag, X)) for p in self.parts]
        return target

    @Cache_this(limit=2, force_kwargs=['which_parts'])
    def psi0(self, Z, variational_posterior):
        return reduce(np.add, (p.psi0(Z, variational_posterior) for p in self.parts))

    @Cache_this(limit=2, force_kwargs=['which_parts'])
    def psi1(self, Z, variational_posterior):
        return reduce(np.add, (p.psi1(Z, variational_posterior) for p in self.parts))

    @Cache_this(limit=2, force_kwargs=['which_parts'])
    def psi2(self, Z, variational_posterior):
        psi2 = reduce(np.add, (p.psi2(Z, variational_posterior) for p in self.parts))
        #return psi2
@ -88,17 +91,18 @@ class Add(CombinationKernel):
            # rbf X bias
            #elif isinstance(p1, (Bias, Fixed)) and isinstance(p2, (RBF, RBFInv)):
            elif isinstance(p1, Bias) and isinstance(p2, (RBF, Linear)):
                tmp = p2.psi1(Z, variational_posterior)
                tmp = p2.psi1(Z, variational_posterior).sum(axis=0)
                psi2 += p1.variance * (tmp[:, :, None] + tmp[:, None, :])
                psi2 += p1.variance * (tmp[:,None]+tmp[None,:]) #(tmp[:, :, None] + tmp[:, None, :])
            #elif isinstance(p2, (Bias, Fixed)) and isinstance(p1, (RBF, RBFInv)):
            elif isinstance(p2, Bias) and isinstance(p1, (RBF, Linear)):
                tmp = p1.psi1(Z, variational_posterior)
                tmp = p1.psi1(Z, variational_posterior).sum(axis=0)
                psi2 += p2.variance * (tmp[:, :, None] + tmp[:, None, :])
                psi2 += p2.variance * (tmp[:,None]+tmp[None,:]) #(tmp[:, :, None] + tmp[:, None, :])
            elif isinstance(p2, (RBF, Linear)) and isinstance(p1, (RBF, Linear)):
                assert np.intersect1d(p1.active_dims, p2.active_dims).size == 0, "only non overlapping kernel dimensions allowed so far"
                tmp1 = p1.psi1(Z, variational_posterior)
                tmp2 = p2.psi1(Z, variational_posterior)
                psi2 += (tmp1[:, :, None] * tmp2[:, None, :]) + (tmp2[:, :, None] * tmp1[:, None, :])
                psi2 += np.einsum('nm,no->mo',tmp1,tmp2)+np.einsum('nm,no->mo',tmp2,tmp1)
                #(tmp1[:, :, None] * tmp2[:, None, :]) + (tmp2[:, :, None] * tmp1[:, None, :])
            else:
                raise NotImplementedError, "psi2 cannot be computed for this kernel"
        return psi2

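# A quick numerical check (illustration only) that the new einsum form of the psi2 cross
# term above equals the per-datum outer products of the two psi1 matrices summed over n,
# i.e. the data-summed version of the commented-out NxMxM expression.

import numpy as np

N, M = 5, 3
tmp1 = np.random.randn(N, M)
tmp2 = np.random.randn(N, M)
cross_new = np.einsum('nm,no->mo', tmp1, tmp2) + np.einsum('nm,no->mo', tmp2, tmp1)
cross_old = ((tmp1[:, :, None]*tmp2[:, None, :]) + (tmp2[:, :, None]*tmp1[:, None, :])).sum(axis=0)
assert np.allclose(cross_new, cross_old)
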
@ -3,16 +3,13 @@

import numpy as np
from scipy import weave
from kern import Kern
from ...util.linalg import tdot
from ...util.misc import param_to_array
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
from ...util.caching import Cache_this
from ...core.parameterization import variational
from psi_comp import linear_psi_comp
from ...util.config import *
from .psi_comp import PSICOMP_Linear

class Linear(Kern):
    """
@ -53,7 +50,8 @@ class Linear(Kern):

        self.variances = Param('variances', variances, Logexp())
        self.add_parameter(self.variances)
        self.psicomp = PSICOMP_Linear()

    @Cache_this(limit=2)
    def K(self, X, X2=None):
        if self.ARD:
@ -95,243 +93,43 @@ class Linear(Kern):

    def gradients_X(self, dL_dK, X, X2=None):
        if X2 is None:
            return 2.*(((X[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1)
            return np.einsum('mq,nm->nq',X*self.variances,dL_dK)+np.einsum('nq,nm->mq',X*self.variances,dL_dK)
        else:
            return (((X2[None,:, :] * self.variances)) * dL_dK[:, :, None]).sum(1)

    def gradients_X_diag(self, dL_dKdiag, X):
        return 2.*self.variances*dL_dKdiag[:,None]*X

    def input_sensitivity(self):
        return np.ones(self.input_dim) * self.variances

    #---------------------------------------#
    #             PSI statistics            #
    #---------------------------------------#

    def psi0(self, Z, variational_posterior):
        if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
            gamma = variational_posterior.binary_prob
            mu = variational_posterior.mean
            S = variational_posterior.variance

            return np.einsum('q,nq,nq->n',self.variances,gamma,np.square(mu)+S)
            # return (self.variances*gamma*(np.square(mu)+S)).sum(axis=1)
        else:
            return np.sum(self.variances * self._mu2S(variational_posterior), 1)
        return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[0]

    def psi1(self, Z, variational_posterior):
        if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
            gamma = variational_posterior.binary_prob
            mu = variational_posterior.mean
            return np.einsum('nq,q,mq,nq->nm',gamma,self.variances,Z,mu)
            # return (self.variances*gamma*mu).sum(axis=1)
        else:
            return self.K(variational_posterior.mean, Z) #the variance, it does nothing
        return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[1]

    @Cache_this(limit=1)
    def psi2(self, Z, variational_posterior):
        if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
            gamma = variational_posterior.binary_prob
            mu = variational_posterior.mean
            S = variational_posterior.variance
            mu2 = np.square(mu)
            variances2 = np.square(self.variances)
            tmp = np.einsum('nq,q,mq,nq->nm',gamma,self.variances,Z,mu)
            return np.einsum('nq,q,mq,oq,nq->nmo',gamma,variances2,Z,Z,mu2+S)+\
                   np.einsum('nm,no->nmo',tmp,tmp) - np.einsum('nq,q,mq,oq,nq->nmo',np.square(gamma),variances2,Z,Z,mu2)
        else:
            ZA = Z * self.variances
            ZAinner = self._ZAinner(variational_posterior, Z)
            return np.dot(ZAinner, ZA.T)
        return self.psicomp.psicomputations(self.variances, Z, variational_posterior)[2]

    def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
        if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
            gamma = variational_posterior.binary_prob
            mu = variational_posterior.mean
            S = variational_posterior.variance
            mu2S = np.square(mu)+S
            _dpsi2_dvariance, _, _, _, _ = linear_psi_comp._psi2computations(self.variances, Z, mu, S, gamma)
            grad = np.einsum('n,nq,nq->q',dL_dpsi0,gamma,mu2S) + np.einsum('nm,nq,mq,nq->q',dL_dpsi1,gamma,Z,mu) +\
                   np.einsum('nmo,nmoq->q',dL_dpsi2,_dpsi2_dvariance)
            if self.ARD:
                self.variances.gradient = grad
            else:
                self.variances.gradient = grad.sum()
        else:
            #psi1
            self.update_gradients_full(dL_dpsi1, variational_posterior.mean, Z)
            # psi0:
            tmp = dL_dpsi0[:, None] * self._mu2S(variational_posterior)
            if self.ARD: self.variances.gradient += tmp.sum(0)
            else: self.variances.gradient += tmp.sum()
            #psi2
            if self.ARD:
                tmp = dL_dpsi2[:, :, :, None] * (self._ZAinner(variational_posterior, Z)[:, :, None, :] * Z[None, None, :, :])
                self.variances.gradient += 2.*tmp.sum(0).sum(0).sum(0)
            else:
                self.variances.gradient += 2.*np.sum(dL_dpsi2 * self.psi2(Z, variational_posterior))/self.variances
        dL_dvar = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[0]
        if self.ARD:
            self.variances.gradient = dL_dvar
        else:
            self.variances.gradient = dL_dvar.sum()

    def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
        if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
            gamma = variational_posterior.binary_prob
            mu = variational_posterior.mean
            S = variational_posterior.variance
            _, _, _, _, _dpsi2_dZ = linear_psi_comp._psi2computations(self.variances, Z, mu, S, gamma)

            grad = np.einsum('nm,nq,q,nq->mq',dL_dpsi1,gamma, self.variances,mu) +\
                   np.einsum('nmo,noq->mq',dL_dpsi2,_dpsi2_dZ)

            return grad
        else:
            #psi1
            grad = self.gradients_X(dL_dpsi1.T, Z, variational_posterior.mean)
            #psi2
            self._weave_dpsi2_dZ(dL_dpsi2, Z, variational_posterior, grad)
            return grad
        return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[1]

    def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
        if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
            gamma = variational_posterior.binary_prob
            mu = variational_posterior.mean
            S = variational_posterior.variance
            mu2S = np.square(mu)+S
            _, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _ = linear_psi_comp._psi2computations(self.variances, Z, mu, S, gamma)

            grad_gamma = np.einsum('n,q,nq->nq',dL_dpsi0,self.variances,mu2S) + np.einsum('nm,q,mq,nq->nq',dL_dpsi1,self.variances,Z,mu) +\
                         np.einsum('nmo,nmoq->nq',dL_dpsi2,_dpsi2_dgamma)
            grad_mu = np.einsum('n,nq,q,nq->nq',dL_dpsi0,gamma,2.*self.variances,mu) + np.einsum('nm,nq,q,mq->nq',dL_dpsi1,gamma,self.variances,Z) +\
                      np.einsum('nmo,nmoq->nq',dL_dpsi2,_dpsi2_dmu)
            grad_S = np.einsum('n,nq,q->nq',dL_dpsi0,gamma,self.variances) + np.einsum('nmo,nmoq->nq',dL_dpsi2,_dpsi2_dS)

            return grad_mu, grad_S, grad_gamma
        else:
            grad_mu, grad_S = np.zeros(variational_posterior.mean.shape), np.zeros(variational_posterior.mean.shape)
            # psi0
            grad_mu += dL_dpsi0[:, None] * (2.0 * variational_posterior.mean * self.variances)
            grad_S += dL_dpsi0[:, None] * self.variances
            # psi1
            grad_mu += (dL_dpsi1[:, :, None] * (Z * self.variances)).sum(1)
            # psi2
            self._weave_dpsi2_dmuS(dL_dpsi2, Z, variational_posterior, grad_mu, grad_S)

            return grad_mu, grad_S
        return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variances, Z, variational_posterior)[2:]

#--------------------------------------------------#
|
|
||||||
# Helpers for psi statistics #
|
|
||||||
#--------------------------------------------------#
|
|
||||||
|
|
||||||
|
|
||||||
def _weave_dpsi2_dmuS(self, dL_dpsi2, Z, vp, target_mu, target_S):
|
|
||||||
# Think N,num_inducing,num_inducing,input_dim
|
|
||||||
ZA = Z * self.variances
|
|
||||||
AZZA = ZA.T[:, None, :, None] * ZA[None, :, None, :]
|
|
||||||
AZZA = AZZA + AZZA.swapaxes(1, 2)
|
|
||||||
AZZA_2 = AZZA/2.
|
|
||||||
if config.getboolean('parallel', 'openmp'):
|
|
||||||
pragma_string = '#pragma omp parallel for private(m,mm,q,qq,factor,tmp)'
|
|
||||||
header_string = '#include <omp.h>'
|
|
||||||
weave_options = {'headers' : ['<omp.h>'],
|
|
||||||
'extra_compile_args': ['-fopenmp -O3'],
|
|
||||||
'extra_link_args' : ['-lgomp'],
|
|
||||||
'libraries': ['gomp']}
|
|
||||||
else:
|
|
||||||
pragma_string = ''
|
|
||||||
header_string = ''
|
|
||||||
weave_options = {'extra_compile_args': ['-O3']}
|
|
||||||
|
|
||||||
#Using weave, we can exploit the symmetry of this problem:
|
|
||||||
code = """
|
|
||||||
int n, m, mm,q,qq;
|
|
||||||
double factor,tmp;
|
|
||||||
%s
|
|
||||||
for(n=0;n<N;n++){
|
|
||||||
for(m=0;m<num_inducing;m++){
|
|
||||||
for(mm=0;mm<=m;mm++){
|
|
||||||
//add in a factor of 2 for the off-diagonal terms (and then count them only once)
|
|
||||||
if(m==mm)
|
|
||||||
factor = dL_dpsi2(n,m,mm);
|
|
||||||
else
|
|
||||||
factor = 2.0*dL_dpsi2(n,m,mm);
|
|
||||||
|
|
||||||
for(q=0;q<input_dim;q++){
|
|
||||||
|
|
||||||
//take the dot product of mu[n,:] and AZZA[:,m,mm,q] TODO: blas!
|
|
||||||
tmp = 0.0;
|
|
||||||
for(qq=0;qq<input_dim;qq++){
|
|
||||||
tmp += mu(n,qq)*AZZA(qq,m,mm,q);
|
|
||||||
}
|
|
||||||
|
|
||||||
target_mu(n,q) += factor*tmp;
|
|
||||||
target_S(n,q) += factor*AZZA_2(q,m,mm,q);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
""" % pragma_string
|
|
||||||
support_code = """
|
|
||||||
%s
|
|
||||||
#include <math.h>
|
|
||||||
""" % header_string
|
|
||||||
mu = vp.mean
|
|
||||||
N,num_inducing,input_dim,mu = mu.shape[0],Z.shape[0],mu.shape[1],param_to_array(mu)
|
|
||||||
weave.inline(code, support_code=support_code,
|
|
||||||
arg_names=['N','num_inducing','input_dim','mu','AZZA','AZZA_2','target_mu','target_S','dL_dpsi2'],
|
|
||||||
type_converters=weave.converters.blitz,**weave_options)
|
|
||||||
|
|
||||||
|
|
||||||
def _weave_dpsi2_dZ(self, dL_dpsi2, Z, vp, target):
|
|
||||||
AZA = self.variances*self._ZAinner(vp, Z)
|
|
||||||
|
|
||||||
if config.getboolean('parallel', 'openmp'):
|
|
||||||
pragma_string = '#pragma omp parallel for private(n,mm,q)'
|
|
||||||
header_string = '#include <omp.h>'
|
|
||||||
weave_options = {'headers' : ['<omp.h>'],
|
|
||||||
'extra_compile_args': ['-fopenmp -O3'],
|
|
||||||
'extra_link_args' : ['-lgomp'],
|
|
||||||
'libraries': ['gomp']}
|
|
||||||
else:
|
|
||||||
pragma_string = ''
|
|
||||||
header_string = ''
|
|
||||||
weave_options = {'extra_compile_args': ['-O3']}
|
|
||||||
|
|
||||||
code="""
|
|
||||||
int n,m,mm,q;
|
|
||||||
%s
|
|
||||||
for(m=0;m<num_inducing;m++){
|
|
||||||
for(q=0;q<input_dim;q++){
|
|
||||||
for(mm=0;mm<num_inducing;mm++){
|
|
||||||
for(n=0;n<N;n++){
|
|
||||||
target(m,q) += 2*dL_dpsi2(n,m,mm)*AZA(n,mm,q);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
""" % pragma_string
|
|
||||||
support_code = """
|
|
||||||
%s
|
|
||||||
#include <math.h>
|
|
||||||
""" % header_string
|
|
||||||
|
|
||||||
N,num_inducing,input_dim = vp.mean.shape[0],Z.shape[0],vp.mean.shape[1]
|
|
||||||
mu = param_to_array(vp.mean)
|
|
||||||
weave.inline(code, support_code=support_code,
|
|
||||||
arg_names=['N','num_inducing','input_dim','AZA','target','dL_dpsi2'],
|
|
||||||
type_converters=weave.converters.blitz,**weave_options)
|
|
||||||
|
|
||||||
|
|
||||||
@Cache_this(limit=1, ignore_args=(0,))
|
|
||||||
def _mu2S(self, vp):
|
|
||||||
return np.square(vp.mean) + vp.variance
|
|
||||||
|
|
||||||
@Cache_this(limit=1)
|
|
||||||
def _ZAinner(self, vp, Z):
|
|
||||||
ZA = Z*self.variances
|
|
||||||
inner = (vp.mean[:, None, :] * vp.mean[:, :, None])
|
|
||||||
diag_indices = np.diag_indices(vp.mean.shape[1], 2)
|
|
||||||
inner[:, diag_indices[0], diag_indices[1]] += vp.variance
|
|
||||||
|
|
||||||
return np.dot(ZA, inner).swapaxes(0, 1) # NOTE: self.ZAinner \in [num_inducing x num_data x input_dim]!
|
|
||||||
|
|
||||||
def input_sensitivity(self):
|
|
||||||
return np.ones(self.input_dim) * self.variances
|
|
||||||
|
|
||||||
class LinearFull(Kern):
    def __init__(self, input_dim, rank, W=None, kappa=None, active_dims=None, name='linear_full'):

@ -1,2 +1,50 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)

from ....core.parameterization.parameter_core import Pickleable
from GPy.util.caching import Cache_this
from ....core.parameterization import variational
import rbf_psi_comp
import ssrbf_psi_comp
import sslinear_psi_comp
import linear_psi_comp

class PSICOMP_RBF(object):

    @Cache_this(limit=2, ignore_args=(0,))
    def psicomputations(self, variance, lengthscale, Z, variational_posterior):
        if isinstance(variational_posterior, variational.NormalPosterior):
            return rbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior)
        elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
            return ssrbf_psi_comp.psicomputations(variance, lengthscale, Z, variational_posterior)
        else:
            raise ValueError, "unknown distribution received for psi-statistics"

    @Cache_this(limit=2, ignore_args=(0,1,2,3))
    def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
        if isinstance(variational_posterior, variational.NormalPosterior):
            return rbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior)
        elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
            return ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior)
        else:
            raise ValueError, "unknown distribution received for psi-statistics"

class PSICOMP_Linear(object):

    @Cache_this(limit=2, ignore_args=(0,))
    def psicomputations(self, variance, Z, variational_posterior):
        if isinstance(variational_posterior, variational.NormalPosterior):
            return linear_psi_comp.psicomputations(variance, Z, variational_posterior)
        elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
            return sslinear_psi_comp.psicomputations(variance, Z, variational_posterior)
        else:
            raise ValueError, "unknown distribution received for psi-statistics"

    @Cache_this(limit=2, ignore_args=(0,1,2,3))
    def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior):
        if isinstance(variational_posterior, variational.NormalPosterior):
            return linear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior)
        elif isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
            return sslinear_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior)
        else:
            raise ValueError, "unknown distribution received for psi-statistics"

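# A small illustration (hypothetical classes, not GPy code) of the dispatch pattern used
# above: the PSICOMP_* objects pick the psi-statistics backend from the type of the
# variational posterior, so a kernel can call one method for both Normal and
# Spike-and-Slab posteriors.

class _NormalQ(object): pass
class _SpikeAndSlabQ(_NormalQ): pass

def psicomputations_dispatch(qX):
    if isinstance(qX, _SpikeAndSlabQ):      # check the more specific type first
        return "spike-and-slab backend"
    elif isinstance(qX, _NormalQ):
        return "normal backend"
    else:
        raise ValueError("unknown distribution received for psi-statistics")

assert psicomputations_dispatch(_SpikeAndSlabQ()) == "spike-and-slab backend"
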
@ -2,14 +2,47 @@
# Licensed under the BSD 3-clause license (see LICENSE.txt)

"""
The package for the Psi statistics computation of the linear kernel for SSGPLVM
The package for the Psi statistics computation of the linear kernel for Bayesian GPLVM
"""

import numpy as np
from GPy.util.caching import Cache_this

#@Cache_this(limit=1)
def psicomputations(variance, Z, variational_posterior):
def _psi2computations(variance, Z, mu, S, gamma):
    """
    Compute psi-statistics for ss-linear kernel
    """
    # here are the "statistics" for psi0, psi1 and psi2
    # Produced intermediate results:
    # psi0    N
    # psi1    NxM
    # psi2    MxM
    mu = variational_posterior.mean
    S = variational_posterior.variance

    psi0 = np.einsum('q,nq->n',variance,np.square(mu)+S)
    psi1 = np.einsum('q,mq,nq->nm',variance,Z,mu)

    tmp = np.einsum('q,mq,nq->nm',variance,Z,mu)
    psi2 = np.einsum('q,mq,oq,nq->mo',np.square(variance),Z,Z,S) + np.einsum('nm,no->mo',tmp,tmp)

    return psi0, psi1, psi2

def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior):
    mu = variational_posterior.mean
    S = variational_posterior.variance

    dL_dvar, dL_dmu, dL_dS, dL_dZ = _psi2computations(dL_dpsi2, variance, Z, mu, S)

    # Compute for psi0 and psi1
    mu2S = np.square(mu)+S
    dL_dvar += np.einsum('n,nq->q',dL_dpsi0,mu2S) + np.einsum('nm,mq,nq->q',dL_dpsi1,Z,mu)
    dL_dmu += np.einsum('n,q,nq->nq',dL_dpsi0,2.*variance,mu) + np.einsum('nm,q,mq->nq',dL_dpsi1,variance,Z)
    dL_dS += np.einsum('n,q->nq',dL_dpsi0,variance)
    dL_dZ += np.einsum('nm,q,nq->mq',dL_dpsi1, variance,mu)

    return dL_dvar, dL_dZ, dL_dmu, dL_dS

def _psi2computations(dL_dpsi2, variance, Z, mu, S):
    """
    Z - MxQ
    mu - NxQ
@ -18,34 +51,24 @@ def _psi2computations(variance, Z, mu, S, gamma):
    """
    # here are the "statistics" for psi1 and psi2
    # Produced intermediate results:
    # _psi2                NxMxM
    # _psi2_dvariance      NxMxMxQ
    # _psi2_dZ             NxMxQ
    # _psi2_dgamma         NxMxMxQ
    # _psi2_dmu            NxMxMxQ
    # _psi2_dS             NxMxMxQ
    # _psi2_dvariance      Q
    # _psi2_dZ             MxQ
    # _psi2_dmu            NxQ
    # _psi2_dS             NxQ

    mu2 = np.square(mu)
    gamma2 = np.square(gamma)
    variance2 = np.square(variance)
    mu2S = mu2+S # NxQ
    common_sum = np.einsum('nq,q,mq,nq->nm',gamma,variance,Z,mu) # NxM
    common_sum = np.einsum('q,mq,nq->nm',variance,Z,mu) # NxM

    _dpsi2_dvariance = np.einsum('nq,q,mq,oq->nmoq',2.*(gamma*mu2S-gamma2*mu2),variance,Z,Z)+\
        np.einsum('nq,mq,nq,no->nmoq',gamma,Z,mu,common_sum)+\
        np.einsum('nq,oq,nq,nm->nmoq',gamma,Z,mu,common_sum)

    _dpsi2_dgamma = np.einsum('q,mq,oq,nq->nmoq',variance2,Z,Z,(mu2S-2.*gamma*mu2))+\
        np.einsum('q,mq,nq,no->nmoq',variance,Z,mu,common_sum)+\
        np.einsum('q,oq,nq,nm->nmoq',variance,Z,mu,common_sum)

    _dpsi2_dmu = np.einsum('q,mq,oq,nq,nq->nmoq',variance2,Z,Z,mu,2.*(gamma-gamma2))+\
        np.einsum('nq,q,mq,no->nmoq',gamma,variance,Z,common_sum)+\
        np.einsum('nq,q,oq,nm->nmoq',gamma,variance,Z,common_sum)

    _dpsi2_dS = np.einsum('nq,q,mq,oq->nmoq',gamma,variance2,Z,Z)

    _dpsi2_dZ = 2.*(np.einsum('nq,q,mq,nq->nmq',gamma,variance2,Z,mu2S)+np.einsum('nq,q,nq,nm->nmq',gamma,variance,mu,common_sum)
        -np.einsum('nq,q,mq,nq->nmq',gamma2,variance2,Z,mu2))

    return _dpsi2_dvariance, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _dpsi2_dZ

    dL_dvar = np.einsum('mo,nq,q,mq,oq->q',dL_dpsi2,2.*S,variance,Z,Z)+\
        np.einsum('mo,mq,nq,no->q',dL_dpsi2,Z,mu,common_sum)+\
        np.einsum('mo,oq,nq,nm->q',dL_dpsi2,Z,mu,common_sum)

    dL_dmu = np.einsum('mo,q,mq,no->nq',dL_dpsi2,variance,Z,common_sum)+\
        np.einsum('mo,q,oq,nm->nq',dL_dpsi2,variance,Z,common_sum)

    dL_dS = np.empty(S.shape)
    dL_dS[:] = np.einsum('mo,q,mq,oq->q',dL_dpsi2,variance2,Z,Z)

    dL_dZ = 2.*(np.einsum('om,q,mq,nq->oq',dL_dpsi2,variance2,Z,S)+np.einsum('om,q,nq,nm->oq',dL_dpsi2,variance,mu,common_sum))

    return dL_dvar, dL_dmu, dL_dS, dL_dZ

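# A quick numerical check (illustration only) of the linear-kernel psi1 above: the einsum
# np.einsum('q,mq,nq->nm', variance, Z, mu) is just mu.dot((Z*variance).T), i.e. the
# linear kernel evaluated between the posterior means and the inducing inputs.

import numpy as np

N, M, Q = 4, 3, 2
variance = np.random.rand(Q)
Z = np.random.randn(M, Q)
mu = np.random.randn(N, Q)
psi1_einsum = np.einsum('q,mq,nq->nm', variance, Z, mu)
psi1_dot = mu.dot((Z*variance).T)
assert np.allclose(psi1_einsum, psi1_dot)
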
162	GPy/kern/_src/psi_comp/rbf_psi_comp.py	Normal file
@ -0,0 +1,162 @@
"""
The module for psi-statistics for RBF kernel
"""

import numpy as np
from GPy.util.caching import Cacher

def psicomputations(variance, lengthscale, Z, variational_posterior):
    """
    Z - MxQ
    mu - NxQ
    S - NxQ
    gamma - NxQ
    """
    # here are the "statistics" for psi0, psi1 and psi2
    # Produced intermediate results:
    # _psi1    NxM
    mu = variational_posterior.mean
    S = variational_posterior.variance

    psi0 = np.empty(mu.shape[0])
    psi0[:] = variance
    psi1 = _psi1computations(variance, lengthscale, Z, mu, S)
    psi2 = _psi2computations(variance, lengthscale, Z, mu, S).sum(axis=0)
    return psi0, psi1, psi2

def __psi1computations(variance, lengthscale, Z, mu, S):
    """
    Z - MxQ
    mu - NxQ
    S - NxQ
    gamma - NxQ
    """
    # here are the "statistics" for psi1
    # Produced intermediate results:
    # _psi1    NxM

    lengthscale2 = np.square(lengthscale)

    # psi1
    _psi1_logdenom = np.log(S/lengthscale2+1.).sum(axis=-1) # N
    _psi1_log = (_psi1_logdenom[:,None]+np.einsum('nmq,nq->nm',np.square(mu[:,None,:]-Z[None,:,:]),1./(S+lengthscale2)))/(-2.)
    _psi1 = variance*np.exp(_psi1_log)

    return _psi1

def __psi2computations(variance, lengthscale, Z, mu, S):
    """
    Z - MxQ
    mu - NxQ
    S - NxQ
    gamma - NxQ
    """
    # here are the "statistics" for psi2
    # Produced intermediate results:
    # _psi2    MxM

    lengthscale2 = np.square(lengthscale)

    _psi2_logdenom = np.log(2.*S/lengthscale2+1.).sum(axis=-1)/(-2.) # N
    _psi2_exp1 = (np.square(Z[:,None,:]-Z[None,:,:])/lengthscale2).sum(axis=-1)/(-4.) #MxM
    Z_hat = (Z[:,None,:]+Z[None,:,:])/2. #MxMxQ
    denom = 1./(2.*S+lengthscale2)
    _psi2_exp2 = -(np.square(mu)*denom).sum(axis=-1)[:,None,None]+2.*np.einsum('nq,moq,nq->nmo',mu,Z_hat,denom)-np.einsum('moq,nq->nmo',np.square(Z_hat),denom)
    _psi2 = variance*variance*np.exp(_psi2_logdenom[:,None,None]+_psi2_exp1[None,:,:]+_psi2_exp2)


    return _psi2

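# A small sanity check (illustration only) of the RBF psi1 defined above: when the
# posterior variance S shrinks to zero, psi1 collapses to the plain RBF kernel evaluated
# between the posterior means and the inducing inputs. The formula is re-stated inline so
# the snippet is self-contained.

import numpy as np

N, M, Q = 4, 3, 2
variance, lengthscale = 0.7, np.array([1.3, 0.8])
Z = np.random.randn(M, Q)
mu = np.random.randn(N, Q)
S = np.full((N, Q), 1e-12)

l2 = np.square(lengthscale)
logdenom = np.log(S/l2 + 1.).sum(axis=-1)
log_psi1 = (logdenom[:, None] + np.einsum('nmq,nq->nm', np.square(mu[:, None, :]-Z[None, :, :]), 1./(S+l2)))/(-2.)
psi1 = variance*np.exp(log_psi1)

K_rbf = variance*np.exp(-0.5*(np.square(mu[:, None, :]-Z[None, :, :])/l2).sum(-1))
assert np.allclose(psi1, K_rbf, atol=1e-6)
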
def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
    ARD = (len(lengthscale)!=1)

    dvar_psi1, dl_psi1, dZ_psi1, dmu_psi1, dS_psi1 = _psi1compDer(dL_dpsi1, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance)
    dvar_psi2, dl_psi2, dZ_psi2, dmu_psi2, dS_psi2 = _psi2compDer(dL_dpsi2, variance, lengthscale, Z, variational_posterior.mean, variational_posterior.variance)

    dL_dvar = np.sum(dL_dpsi0) + dvar_psi1 + dvar_psi2

    dL_dlengscale = dl_psi1 + dl_psi2
    if not ARD:
        dL_dlengscale = dL_dlengscale.sum()

    dL_dmu = dmu_psi1 + dmu_psi2
    dL_dS = dS_psi1 + dS_psi2
    dL_dZ = dZ_psi1 + dZ_psi2

    return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS

|
def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S):
|
||||||
|
"""
|
||||||
|
dL_dpsi1 - NxM
|
||||||
|
Z - MxQ
|
||||||
|
mu - NxQ
|
||||||
|
S - NxQ
|
||||||
|
gamma - NxQ
|
||||||
|
"""
|
||||||
|
# here are the "statistics" for psi1
|
||||||
|
# Produced intermediate results: dL_dparams w.r.t. psi1
|
||||||
|
# _dL_dvariance 1
|
||||||
|
# _dL_dlengthscale Q
|
||||||
|
# _dL_dZ MxQ
|
||||||
|
# _dL_dgamma NxQ
|
||||||
|
# _dL_dmu NxQ
|
||||||
|
# _dL_dS NxQ
|
||||||
|
|
||||||
|
lengthscale2 = np.square(lengthscale)
|
||||||
|
|
||||||
|
_psi1 = _psi1computations(variance, lengthscale, Z, mu, S)
|
||||||
|
Lpsi1 = dL_dpsi1*_psi1
|
||||||
|
Zmu = Z[None,:,:]-mu[:,None,:] # NxMxQ
|
||||||
|
denom = 1./(S+lengthscale2)
|
||||||
|
Zmu2_denom = np.square(Zmu)*denom[:,None,:] #NxMxQ
|
||||||
|
_dL_dvar = Lpsi1.sum()/variance
|
||||||
|
_dL_dmu = np.einsum('nm,nmq,nq->nq',Lpsi1,Zmu,denom)
|
||||||
|
_dL_dS = np.einsum('nm,nmq,nq->nq',Lpsi1,(Zmu2_denom-1.),denom)/2.
|
||||||
|
_dL_dZ = -np.einsum('nm,nmq,nq->mq',Lpsi1,Zmu,denom)
|
||||||
|
_dL_dl = np.einsum('nm,nmq,nq->q',Lpsi1,(Zmu2_denom+(S/lengthscale2)[:,None,:]),denom*lengthscale)
|
||||||
|
|
||||||
|
return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS
|
||||||
|
|
||||||
|
def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S):
|
||||||
|
"""
|
||||||
|
Z - MxQ
|
||||||
|
mu - NxQ
|
||||||
|
S - NxQ
|
||||||
|
gamma - NxQ
|
||||||
|
dL_dpsi2 - MxM
|
||||||
|
"""
|
||||||
|
# here are the "statistics" for psi2
|
||||||
|
# Produced the derivatives w.r.t. psi2:
|
||||||
|
# _dL_dvariance 1
|
||||||
|
# _dL_dlengthscale Q
|
||||||
|
# _dL_dZ MxQ
|
||||||
|
# _dL_dgamma NxQ
|
||||||
|
# _dL_dmu NxQ
|
||||||
|
# _dL_dS NxQ
|
||||||
|
|
||||||
|
lengthscale2 = np.square(lengthscale)
|
||||||
|
denom = 1./(2*S+lengthscale2)
|
||||||
|
denom2 = np.square(denom)
|
||||||
|
|
||||||
|
_psi2 = _psi2computations(variance, lengthscale, Z, mu, S) # NxMxM
|
||||||
|
|
||||||
|
Lpsi2 = dL_dpsi2[None,:,:]*_psi2
|
||||||
|
Lpsi2sum = np.einsum('nmo->n',Lpsi2) #N
|
||||||
|
Lpsi2Z = np.einsum('nmo,oq->nq',Lpsi2,Z) #NxQ
|
||||||
|
Lpsi2Z2 = np.einsum('nmo,oq,oq->nq',Lpsi2,Z,Z) #NxQ
|
||||||
|
Lpsi2Z2p = np.einsum('nmo,mq,oq->nq',Lpsi2,Z,Z) #NxQ
|
||||||
|
Lpsi2Zhat = Lpsi2Z
|
||||||
|
Lpsi2Zhat2 = (Lpsi2Z2+Lpsi2Z2p)/2
|
||||||
|
|
||||||
|
_dL_dvar = Lpsi2sum.sum()*2/variance
|
||||||
|
_dL_dmu = (-2*denom) * (mu*Lpsi2sum[:,None]-Lpsi2Zhat)
|
||||||
|
_dL_dS = (2*np.square(denom))*(np.square(mu)*Lpsi2sum[:,None]-2*mu*Lpsi2Zhat+Lpsi2Zhat2) - denom*Lpsi2sum[:,None]
|
||||||
|
_dL_dZ = -np.einsum('nmo,oq->oq',Lpsi2,Z)/lengthscale2+np.einsum('nmo,oq->mq',Lpsi2,Z)/lengthscale2+ \
|
||||||
|
2*np.einsum('nmo,nq,nq->mq',Lpsi2,mu,denom) - np.einsum('nmo,nq,mq->mq',Lpsi2,denom,Z) - np.einsum('nmo,oq,nq->mq',Lpsi2,Z,denom)
|
||||||
|
_dL_dl = 2*lengthscale* ((S/lengthscale2*denom+np.square(mu*denom))*Lpsi2sum[:,None]+(Lpsi2Z2-Lpsi2Z2p)/(2*np.square(lengthscale2))-
|
||||||
|
(2*mu*denom2)*Lpsi2Zhat+denom2*Lpsi2Zhat2).sum(axis=0)
|
||||||
|
|
||||||
|
return _dL_dvar, _dL_dl, _dL_dZ, _dL_dmu, _dL_dS
|
||||||
|
|
||||||
|
_psi1computations = Cacher(__psi1computations, limit=1)
|
||||||
|
_psi2computations = Cacher(__psi2computations, limit=1)
|
||||||
411
GPy/kern/_src/psi_comp/rbf_psi_gpucomp.py
Normal file
411
GPy/kern/_src/psi_comp/rbf_psi_gpucomp.py
Normal file
|
|
@ -0,0 +1,411 @@
|
||||||
|
"""
|
||||||
|
The module for psi-statistics for RBF kernel
|
||||||
|
"""
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from ....util.caching import Cache_this
|
||||||
|
from . import PSICOMP_RBF
|
||||||
|
from ....util import gpu_init
|
||||||
|
|
||||||
|
try:
|
||||||
|
import pycuda.gpuarray as gpuarray
|
||||||
|
from pycuda.compiler import SourceModule
|
||||||
|
from ....util.linalg_gpu import sum_axis
|
||||||
|
except:
|
||||||
|
pass
|
||||||
|
|
||||||
|
gpu_code = """
|
||||||
|
// define THREADNUM
|
||||||
|
|
||||||
|
#define IDX_NMQ(n,m,q) ((q*M+m)*N+n)
|
||||||
|
#define IDX_NMM(n,m1,m2) ((m2*M+m1)*N+n)
|
||||||
|
#define IDX_NQ(n,q) (q*N+n)
|
||||||
|
#define IDX_NM(n,m) (m*N+n)
|
||||||
|
#define IDX_MQ(m,q) (q*M+m)
|
||||||
|
#define IDX_MM(m1,m2) (m2*M+m1)
|
||||||
|
#define IDX_NQB(n,q,b) ((b*Q+q)*N+n)
|
||||||
|
#define IDX_QB(q,b) (b*Q+q)
|
||||||
|
|
||||||
|
// Divide data evenly
|
||||||
|
__device__ void divide_data(int total_data, int psize, int pidx, int *start, int *end) {
|
||||||
|
int residue = (total_data)%psize;
|
||||||
|
if(pidx<residue) {
|
||||||
|
int size = total_data/psize+1;
|
||||||
|
*start = size*pidx;
|
||||||
|
*end = *start+size;
|
||||||
|
} else {
|
||||||
|
int size = total_data/psize;
|
||||||
|
*start = size*pidx+residue;
|
||||||
|
*end = *start+size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__device__ void reduce_sum(double* array, int array_size) {
|
||||||
|
int s;
|
||||||
|
if(array_size >= blockDim.x) {
|
||||||
|
for(int i=blockDim.x+threadIdx.x; i<array_size; i+= blockDim.x) {
|
||||||
|
array[threadIdx.x] += array[i];
|
||||||
|
}
|
||||||
|
array_size = blockDim.x;
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
for(int i=1; i<=array_size;i*=2) {s=i;}
|
||||||
|
if(threadIdx.x < array_size-s) {array[threadIdx.x] += array[s+threadIdx.x];}
|
||||||
|
__syncthreads();
|
||||||
|
for(s=s/2;s>=1;s=s/2) {
|
||||||
|
if(threadIdx.x < s) {array[threadIdx.x] += array[s+threadIdx.x];}
|
||||||
|
__syncthreads();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void compDenom(double *log_denom1, double *log_denom2, double *l, double *S, int N, int Q)
|
||||||
|
{
|
||||||
|
int n_start, n_end;
|
||||||
|
divide_data(N, gridDim.x, blockIdx.x, &n_start, &n_end);
|
||||||
|
|
||||||
|
for(int i=n_start*Q+threadIdx.x; i<n_end*Q; i+=blockDim.x) {
|
||||||
|
int n=i/Q;
|
||||||
|
int q=i%Q;
|
||||||
|
|
||||||
|
double Snq = S[IDX_NQ(n,q)];
|
||||||
|
double lq = l[q]*l[q];
|
||||||
|
log_denom1[IDX_NQ(n,q)] = log(Snq/lq+1.);
|
||||||
|
log_denom2[IDX_NQ(n,q)] = log(2.*Snq/lq+1.);
|
||||||
|
}
|
||||||
}

__global__ void psi1computations(double *psi1, double *log_denom1, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
{
    int m_start, m_end;
    divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);

    for(int m=m_start; m<m_end; m++) {
        for(int n=threadIdx.x; n<N; n+= blockDim.x) {
            double log_psi1 = 0;
            for(int q=0;q<Q;q++) {
                double muZ = mu[IDX_NQ(n,q)]-Z[IDX_MQ(m,q)];
                double Snq = S[IDX_NQ(n,q)];
                double lq = l[q]*l[q];
                log_psi1 += (muZ*muZ/(Snq+lq)+log_denom1[IDX_NQ(n,q)])/(-2.);
            }
            psi1[IDX_NM(n,m)] = var*exp(log_psi1);
        }
    }
}

__global__ void psi2computations(double *psi2, double *psi2n, double *log_denom2, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
{
    int psi2_idx_start, psi2_idx_end;
    __shared__ double psi2_local[THREADNUM];
    divide_data((M+1)*M/2, gridDim.x, blockIdx.x, &psi2_idx_start, &psi2_idx_end);

    for(int psi2_idx=psi2_idx_start; psi2_idx<psi2_idx_end; psi2_idx++) {
        int m1 = int((sqrt(8.*psi2_idx+1.)-1.)/2.);
        int m2 = psi2_idx - (m1+1)*m1/2;

        psi2_local[threadIdx.x] = 0;
        for(int n=threadIdx.x;n<N;n+=blockDim.x) {
            double log_psi2_n = 0;
            for(int q=0;q<Q;q++) {
                double dZ = Z[IDX_MQ(m1,q)] - Z[IDX_MQ(m2,q)];
                double muZhat = mu[IDX_NQ(n,q)]- (Z[IDX_MQ(m1,q)]+Z[IDX_MQ(m2,q)])/2.;
                double Snq = S[IDX_NQ(n,q)];
                double lq = l[q]*l[q];
                log_psi2_n += dZ*dZ/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) + log_denom2[IDX_NQ(n,q)]/(-2.);
            }
            double exp_psi2_n = exp(log_psi2_n);
            psi2n[IDX_NMM(n,m1,m2)] = var*var*exp_psi2_n;
            if(m1!=m2) { psi2n[IDX_NMM(n,m2,m1)] = var*var*exp_psi2_n;}
            psi2_local[threadIdx.x] += exp_psi2_n;
        }
        __syncthreads();
        reduce_sum(psi2_local, THREADNUM);
        if(threadIdx.x==0) {
            psi2[IDX_MM(m1,m2)] = var*var*psi2_local[0];
            if(m1!=m2) { psi2[IDX_MM(m2,m1)] = var*var*psi2_local[0]; }
        }
        __syncthreads();
    }
}

__global__ void psi1compDer(double *dvar, double *dl, double *dZ, double *dmu, double *dS, double *dL_dpsi1, double *psi1, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
{
    int m_start, m_end;
    __shared__ double g_local[THREADNUM];
    divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
    int P = int(ceil(double(N)/THREADNUM));

    double dvar_local = 0;
    for(int q=0;q<Q;q++) {
        double lq_sqrt = l[q];
        double lq = lq_sqrt*lq_sqrt;
        double dl_local = 0;
        for(int p=0;p<P;p++) {
            int n = p*THREADNUM + threadIdx.x;
            double dmu_local = 0;
            double dS_local = 0;
            double Snq,mu_nq;
            if(n<N) {Snq = S[IDX_NQ(n,q)]; mu_nq=mu[IDX_NQ(n,q)];}
            for(int m=m_start; m<m_end; m++) {
                if(n<N) {
                    double lpsi1 = psi1[IDX_NM(n,m)]*dL_dpsi1[IDX_NM(n,m)];
                    if(q==0) {dvar_local += lpsi1;}

                    double Zmu = Z[IDX_MQ(m,q)] - mu_nq;
                    double denom = Snq+lq;
                    double Zmu2_denom = Zmu*Zmu/denom;

                    dmu_local += lpsi1*Zmu/denom;
                    dS_local += lpsi1*(Zmu2_denom-1.)/denom;
                    dl_local += lpsi1*(Zmu2_denom+Snq/lq)/denom;
                    g_local[threadIdx.x] = -lpsi1*Zmu/denom;
                }
                __syncthreads();
                reduce_sum(g_local, p<P-1?THREADNUM:N-(P-1)*THREADNUM);
                if(threadIdx.x==0) {dZ[IDX_MQ(m,q)] += g_local[0];}
            }
            if(n<N) {
                dmu[IDX_NQB(n,q,blockIdx.x)] += dmu_local;
                dS[IDX_NQB(n,q,blockIdx.x)] += dS_local/2.;
            }
            __threadfence_block();
        }
        g_local[threadIdx.x] = dl_local*lq_sqrt;
        __syncthreads();
        reduce_sum(g_local, THREADNUM);
        if(threadIdx.x==0) {dl[IDX_QB(q,blockIdx.x)] += g_local[0];}
    }
    g_local[threadIdx.x] = dvar_local;
    __syncthreads();
    reduce_sum(g_local, THREADNUM);
    if(threadIdx.x==0) {dvar[blockIdx.x] += g_local[0]/var;}
}

__global__ void psi2compDer(double *dvar, double *dl, double *dZ, double *dmu, double *dS, double *dL_dpsi2, double *psi2n, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
{
    int m_start, m_end;
    __shared__ double g_local[THREADNUM];
    divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
    int P = int(ceil(double(N)/THREADNUM));

    double dvar_local = 0;
    for(int q=0;q<Q;q++) {
        double lq_sqrt = l[q];
        double lq = lq_sqrt*lq_sqrt;
        double dl_local = 0;
        for(int p=0;p<P;p++) {
            int n = p*THREADNUM + threadIdx.x;
            double dmu_local = 0;
            double dS_local = 0;
            double Snq,mu_nq;
            if(n<N) {Snq = S[IDX_NQ(n,q)]; mu_nq=mu[IDX_NQ(n,q)];}
            for(int m1=m_start; m1<m_end; m1++) {
                g_local[threadIdx.x] = 0;
                for(int m2=0;m2<M;m2++) {
                    if(n<N) {
                        double lpsi2 = psi2n[IDX_NMM(n,m1,m2)]*dL_dpsi2[IDX_MM(m1,m2)];
                        if(q==0) {dvar_local += lpsi2;}

                        double dZ = Z[IDX_MQ(m1,q)] - Z[IDX_MQ(m2,q)];
                        double muZhat = mu_nq - (Z[IDX_MQ(m1,q)] + Z[IDX_MQ(m2,q)])/2.;
                        double denom = 2.*Snq+lq;
                        double muZhat2_denom = muZhat*muZhat/denom;

                        dmu_local += lpsi2*muZhat/denom;
                        dS_local += lpsi2*(2.*muZhat2_denom-1.)/denom;
                        dl_local += lpsi2*((Snq/lq+muZhat2_denom)/denom+dZ*dZ/(4.*lq*lq));
                        g_local[threadIdx.x] += 2.*lpsi2*(muZhat/denom-dZ/(2*lq));
                    }
                }
                __syncthreads();
                reduce_sum(g_local, p<P-1?THREADNUM:N-(P-1)*THREADNUM);
                if(threadIdx.x==0) {dZ[IDX_MQ(m1,q)] += g_local[0];}
            }
            if(n<N) {
                dmu[IDX_NQB(n,q,blockIdx.x)] += -2.*dmu_local;
                dS[IDX_NQB(n,q,blockIdx.x)] += dS_local;
            }
            __threadfence_block();
        }
        g_local[threadIdx.x] = dl_local*2.*lq_sqrt;
        __syncthreads();
        reduce_sum(g_local, THREADNUM);
        if(threadIdx.x==0) {dl[IDX_QB(q,blockIdx.x)] += g_local[0];}
    }
    g_local[threadIdx.x] = dvar_local;
    __syncthreads();
    reduce_sum(g_local, THREADNUM);
    if(threadIdx.x==0) {dvar[blockIdx.x] += g_local[0]*2/var;}
}
"""
class PSICOMP_RBF_GPU(PSICOMP_RBF):

    def __init__(self, threadnum=128, blocknum=15, GPU_direct=False):
        self.GPU_direct = GPU_direct
        self.gpuCache = None

        self.threadnum = threadnum
        self.blocknum = blocknum
        module = SourceModule("#define THREADNUM "+str(self.threadnum)+"\n"+gpu_code)
        self.g_psi1computations = module.get_function('psi1computations')
        self.g_psi1computations.prepare('PPdPPPPiii')
        self.g_psi2computations = module.get_function('psi2computations')
        self.g_psi2computations.prepare('PPPdPPPPiii')
        self.g_psi1compDer = module.get_function('psi1compDer')
        self.g_psi1compDer.prepare('PPPPPPPdPPPPiii')
        self.g_psi2compDer = module.get_function('psi2compDer')
        self.g_psi2compDer.prepare('PPPPPPPdPPPPiii')
        self.g_compDenom = module.get_function('compDenom')
        self.g_compDenom.prepare('PPPPii')

    def __deepcopy__(self, memo):
        s = PSICOMP_RBF_GPU(threadnum=self.threadnum, blocknum=self.blocknum, GPU_direct=self.GPU_direct)
        memo[id(self)] = s
        return s

    def _initGPUCache(self, N, M, Q):
        if self.gpuCache is None:
            self.gpuCache = {
                'l_gpu'          : gpuarray.empty((Q,), np.float64, order='F'),
                'Z_gpu'          : gpuarray.empty((M,Q), np.float64, order='F'),
                'mu_gpu'         : gpuarray.empty((N,Q), np.float64, order='F'),
                'S_gpu'          : gpuarray.empty((N,Q), np.float64, order='F'),
                'psi1_gpu'       : gpuarray.empty((N,M), np.float64, order='F'),
                'psi2_gpu'       : gpuarray.empty((M,M), np.float64, order='F'),
                'psi2n_gpu'      : gpuarray.empty((N,M,M), np.float64, order='F'),
                'dL_dpsi1_gpu'   : gpuarray.empty((N,M), np.float64, order='F'),
                'dL_dpsi2_gpu'   : gpuarray.empty((M,M), np.float64, order='F'),
                'log_denom1_gpu' : gpuarray.empty((N,Q), np.float64, order='F'),
                'log_denom2_gpu' : gpuarray.empty((N,Q), np.float64, order='F'),
                # derivatives
                'dvar_gpu'       : gpuarray.empty((self.blocknum,), np.float64, order='F'),
                'dl_gpu'         : gpuarray.empty((Q,self.blocknum), np.float64, order='F'),
                'dZ_gpu'         : gpuarray.empty((M,Q), np.float64, order='F'),
                'dmu_gpu'        : gpuarray.empty((N,Q,self.blocknum), np.float64, order='F'),
                'dS_gpu'         : gpuarray.empty((N,Q,self.blocknum), np.float64, order='F'),
                # grad
                'grad_l_gpu'     : gpuarray.empty((Q,), np.float64, order='F'),
                'grad_mu_gpu'    : gpuarray.empty((N,Q,), np.float64, order='F'),
                'grad_S_gpu'     : gpuarray.empty((N,Q,), np.float64, order='F'),
            }
        else:
            assert N==self.gpuCache['mu_gpu'].shape[0]
            assert M==self.gpuCache['Z_gpu'].shape[0]
            assert Q==self.gpuCache['l_gpu'].shape[0]

    def sync_params(self, lengthscale, Z, mu, S):
        if len(lengthscale)==1:
            self.gpuCache['l_gpu'].fill(lengthscale)
        else:
            self.gpuCache['l_gpu'].set(np.asfortranarray(lengthscale))
        self.gpuCache['Z_gpu'].set(np.asfortranarray(Z))
        self.gpuCache['mu_gpu'].set(np.asfortranarray(mu))
        self.gpuCache['S_gpu'].set(np.asfortranarray(S))
        N,Q = self.gpuCache['S_gpu'].shape
        # t=self.g_compDenom(self.gpuCache['log_denom1_gpu'],self.gpuCache['log_denom2_gpu'],self.gpuCache['l_gpu'],self.gpuCache['S_gpu'], np.int32(N), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
        # print 'g_compDenom '+str(t)
        self.g_compDenom.prepared_call((self.blocknum,1),(self.threadnum,1,1), self.gpuCache['log_denom1_gpu'].gpudata,self.gpuCache['log_denom2_gpu'].gpudata,self.gpuCache['l_gpu'].gpudata,self.gpuCache['S_gpu'].gpudata, np.int32(N), np.int32(Q))

    def reset_derivative(self):
        self.gpuCache['dvar_gpu'].fill(0.)
        self.gpuCache['dl_gpu'].fill(0.)
        self.gpuCache['dZ_gpu'].fill(0.)
        self.gpuCache['dmu_gpu'].fill(0.)
        self.gpuCache['dS_gpu'].fill(0.)
        self.gpuCache['grad_l_gpu'].fill(0.)
        self.gpuCache['grad_mu_gpu'].fill(0.)
        self.gpuCache['grad_S_gpu'].fill(0.)

    def get_dimensions(self, Z, variational_posterior):
        return variational_posterior.mean.shape[0], Z.shape[0], Z.shape[1]

    @Cache_this(limit=1, ignore_args=(0,))
    def psicomputations(self, variance, lengthscale, Z, variational_posterior):
        """
        Z - MxQ
        mu - NxQ
        S - NxQ
        """
        N,M,Q = self.get_dimensions(Z, variational_posterior)
        self._initGPUCache(N,M,Q)
        self.sync_params(lengthscale, Z, variational_posterior.mean, variational_posterior.variance)

        psi1_gpu = self.gpuCache['psi1_gpu']
        psi2_gpu = self.gpuCache['psi2_gpu']
        psi2n_gpu = self.gpuCache['psi2n_gpu']
        l_gpu = self.gpuCache['l_gpu']
        Z_gpu = self.gpuCache['Z_gpu']
        mu_gpu = self.gpuCache['mu_gpu']
        S_gpu = self.gpuCache['S_gpu']
        log_denom1_gpu = self.gpuCache['log_denom1_gpu']
        log_denom2_gpu = self.gpuCache['log_denom2_gpu']

        psi0 = np.empty((N,))
        psi0[:] = variance
        self.g_psi1computations.prepared_call((self.blocknum,1),(self.threadnum,1,1),psi1_gpu.gpudata, log_denom1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))
        self.g_psi2computations.prepared_call((self.blocknum,1),(self.threadnum,1,1),psi2_gpu.gpudata, psi2n_gpu.gpudata, log_denom2_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))
        # t = self.g_psi1computations(psi1_gpu, log_denom1_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
        # print 'g_psi1computations '+str(t)
        # t = self.g_psi2computations(psi2_gpu, psi2n_gpu, log_denom2_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
        # print 'g_psi2computations '+str(t)

        if self.GPU_direct:
            return psi0, psi1_gpu, psi2_gpu
        else:
            return psi0, psi1_gpu.get(), psi2_gpu.get()

    @Cache_this(limit=1, ignore_args=(0,1,2,3))
    def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
        ARD = (len(lengthscale)!=1)

        N,M,Q = self.get_dimensions(Z, variational_posterior)
        psi1_gpu = self.gpuCache['psi1_gpu']
        psi2n_gpu = self.gpuCache['psi2n_gpu']
        l_gpu = self.gpuCache['l_gpu']
        Z_gpu = self.gpuCache['Z_gpu']
        mu_gpu = self.gpuCache['mu_gpu']
        S_gpu = self.gpuCache['S_gpu']
        dvar_gpu = self.gpuCache['dvar_gpu']
        dl_gpu = self.gpuCache['dl_gpu']
        dZ_gpu = self.gpuCache['dZ_gpu']
        dmu_gpu = self.gpuCache['dmu_gpu']
        dS_gpu = self.gpuCache['dS_gpu']
        grad_l_gpu = self.gpuCache['grad_l_gpu']
        grad_mu_gpu = self.gpuCache['grad_mu_gpu']
        grad_S_gpu = self.gpuCache['grad_S_gpu']

        if self.GPU_direct:
            dL_dpsi1_gpu = dL_dpsi1
            dL_dpsi2_gpu = dL_dpsi2
            dL_dpsi0_sum = gpuarray.sum(dL_dpsi0).get()
        else:
            dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu']
            dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu']
            dL_dpsi1_gpu.set(np.asfortranarray(dL_dpsi1))
            dL_dpsi2_gpu.set(np.asfortranarray(dL_dpsi2))
            dL_dpsi0_sum = dL_dpsi0.sum()

        self.reset_derivative()
        # t=self.g_psi1compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi1_gpu,psi1_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
        # print 'g_psi1compDer '+str(t)
        # t=self.g_psi2compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi2_gpu,psi2n_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
        # print 'g_psi2compDer '+str(t)
        self.g_psi1compDer.prepared_call((self.blocknum,1),(self.threadnum,1,1),dvar_gpu.gpudata,dl_gpu.gpudata,dZ_gpu.gpudata,dmu_gpu.gpudata,dS_gpu.gpudata,dL_dpsi1_gpu.gpudata,psi1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))
        self.g_psi2compDer.prepared_call((self.blocknum,1),(self.threadnum,1,1),dvar_gpu.gpudata,dl_gpu.gpudata,dZ_gpu.gpudata,dmu_gpu.gpudata,dS_gpu.gpudata,dL_dpsi2_gpu.gpudata,psi2n_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))

        dL_dvar = dL_dpsi0_sum + gpuarray.sum(dvar_gpu).get()
        sum_axis(grad_mu_gpu,dmu_gpu,N*Q,self.blocknum)
        dL_dmu = grad_mu_gpu.get()
        sum_axis(grad_S_gpu,dS_gpu,N*Q,self.blocknum)
        dL_dS = grad_S_gpu.get()
        dL_dZ = dZ_gpu.get()
        if ARD:
            sum_axis(grad_l_gpu,dl_gpu,Q,self.blocknum)
            dL_dlengthscale = grad_l_gpu.get()
        else:
            dL_dlengthscale = gpuarray.sum(dl_gpu).get()

        return dL_dvar, dL_dlengthscale, dL_dZ, dL_dmu, dL_dS
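A minimal usage sketch for the class above, assuming a working PyCUDA setup; the namedtuple stand-in for the variational posterior and all sizes are illustrative only (the real GPy posterior objects expose mean and variance the same way):

import numpy as np
from collections import namedtuple

N, M, Q = 100, 10, 3
VP = namedtuple('VP', ['mean', 'variance'])
vp = VP(mean=np.random.randn(N, Q), variance=np.random.rand(N, Q))

psicomp = PSICOMP_RBF_GPU(threadnum=128, blocknum=15, GPU_direct=False)
psi0, psi1, psi2 = psicomp.psicomputations(1.0, np.ones(Q), np.random.randn(M, Q), vp)
# psi0: (N,), psi1: (N,M), psi2: (M,M) host arrays when GPU_direct is False.
# The GPU cache is sized on first use, so N, M and Q must stay fixed afterwards.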
GPy/kern/_src/psi_comp/sslinear_psi_comp.py (new file, 88 lines)
@@ -0,0 +1,88 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)

"""
The package for the Psi statistics computation of the linear kernel for SSGPLVM
"""

import numpy as np

def psicomputations(variance, Z, variational_posterior):
    """
    Compute psi-statistics for ss-linear kernel
    """
    # here are the "statistics" for psi0, psi1 and psi2
    # Produced intermediate results:
    # psi0    N
    # psi1    NxM
    # psi2    MxM
    mu = variational_posterior.mean
    S = variational_posterior.variance
    gamma = variational_posterior.binary_prob

    psi0 = np.einsum('q,nq,nq->n',variance,gamma,np.square(mu)+S)
    psi1 = np.einsum('nq,q,mq,nq->nm',gamma,variance,Z,mu)
    mu2 = np.square(mu)
    variances2 = np.square(variance)
    tmp = np.einsum('nq,q,mq,nq->nm',gamma,variance,Z,mu)
    psi2 = np.einsum('nq,q,mq,oq,nq->mo',gamma,variances2,Z,Z,mu2+S)+\
        np.einsum('nm,no->mo',tmp,tmp) - np.einsum('nq,q,mq,oq,nq->mo',np.square(gamma),variances2,Z,Z,mu2)

    return psi0, psi1, psi2

def psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, Z, variational_posterior):
    mu = variational_posterior.mean
    S = variational_posterior.variance
    gamma = variational_posterior.binary_prob

    dL_dvar, dL_dgamma, dL_dmu, dL_dS, dL_dZ = _psi2computations(dL_dpsi2, variance, Z, mu, S, gamma)

    # Compute for psi0 and psi1
    mu2S = np.square(mu)+S
    dL_dvar += np.einsum('n,nq,nq->q',dL_dpsi0,gamma,mu2S) + np.einsum('nm,nq,mq,nq->q',dL_dpsi1,gamma,Z,mu)
    dL_dgamma += np.einsum('n,q,nq->nq',dL_dpsi0,variance,mu2S) + np.einsum('nm,q,mq,nq->nq',dL_dpsi1,variance,Z,mu)
    dL_dmu += np.einsum('n,nq,q,nq->nq',dL_dpsi0,gamma,2.*variance,mu) + np.einsum('nm,nq,q,mq->nq',dL_dpsi1,gamma,variance,Z)
    dL_dS += np.einsum('n,nq,q->nq',dL_dpsi0,gamma,variance)
    dL_dZ += np.einsum('nm,nq,q,nq->mq',dL_dpsi1,gamma, variance,mu)

    return dL_dvar, dL_dZ, dL_dmu, dL_dS, dL_dgamma

def _psi2computations(dL_dpsi2, variance, Z, mu, S, gamma):
    """
    Z - MxQ
    mu - NxQ
    S - NxQ
    gamma - NxQ
    """
    # here are the "statistics" for psi1 and psi2
    # Produced intermediate results:
    # _psi2_dvariance      Q
    # _psi2_dZ             MxQ
    # _psi2_dgamma         NxQ
    # _psi2_dmu            NxQ
    # _psi2_dS             NxQ

    mu2 = np.square(mu)
    gamma2 = np.square(gamma)
    variance2 = np.square(variance)
    mu2S = mu2+S # NxQ
    common_sum = np.einsum('nq,q,mq,nq->nm',gamma,variance,Z,mu) # NxM

    dL_dvar = np.einsum('mo,nq,q,mq,oq->q',dL_dpsi2,2.*(gamma*mu2S-gamma2*mu2),variance,Z,Z)+\
        np.einsum('mo,nq,mq,nq,no->q',dL_dpsi2,gamma,Z,mu,common_sum)+\
        np.einsum('mo,nq,oq,nq,nm->q',dL_dpsi2,gamma,Z,mu,common_sum)

    dL_dgamma = np.einsum('mo,q,mq,oq,nq->nq',dL_dpsi2,variance2,Z,Z,(mu2S-2.*gamma*mu2))+\
        np.einsum('mo,q,mq,nq,no->nq',dL_dpsi2,variance,Z,mu,common_sum)+\
        np.einsum('mo,q,oq,nq,nm->nq',dL_dpsi2,variance,Z,mu,common_sum)

    dL_dmu = np.einsum('mo,q,mq,oq,nq,nq->nq',dL_dpsi2,variance2,Z,Z,mu,2.*(gamma-gamma2))+\
        np.einsum('mo,nq,q,mq,no->nq',dL_dpsi2,gamma,variance,Z,common_sum)+\
        np.einsum('mo,nq,q,oq,nm->nq',dL_dpsi2,gamma,variance,Z,common_sum)

    dL_dS = np.einsum('mo,nq,q,mq,oq->nq',dL_dpsi2,gamma,variance2,Z,Z)

    dL_dZ = 2.*(np.einsum('om,nq,q,mq,nq->oq',dL_dpsi2,gamma,variance2,Z,mu2S)+np.einsum('om,nq,q,nq,nm->oq',dL_dpsi2,gamma,variance,mu,common_sum)
        -np.einsum('om,nq,q,mq,nq->oq',dL_dpsi2,gamma2,variance2,Z,mu2))

    return dL_dvar, dL_dgamma, dL_dmu, dL_dS, dL_dZ
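As a quick sanity check on the psi1 einsum above, the same NxM matrix can be written as an ordinary matrix product; a small self-contained sketch (shapes are illustrative only):

import numpy as np

N, M, Q = 5, 4, 3
rng = np.random.RandomState(0)
variance = rng.rand(Q)
Z, mu, gamma = rng.randn(M, Q), rng.randn(N, Q), rng.rand(N, Q)

psi1_einsum = np.einsum('nq,q,mq,nq->nm', gamma, variance, Z, mu)
psi1_matmul = (gamma * mu * variance).dot(Z.T)   # same contraction over q
assert np.allclose(psi1_einsum, psi1_matmul)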
@@ -6,10 +6,8 @@ The package for the psi statistics computation
 """
 import numpy as np
-from GPy.util.caching import Cache_this
 
 
-@Cache_this(limit=1)
-def psicomputations(variance, lengthscale, Z, mu, S, gamma):
+def psicomputations(variance, lengthscale, Z, variational_posterior):
     """
     Z - MxQ
     mu - NxQ
@@ -19,6 +17,9 @@ def psicomputations(variance, lengthscale, Z, mu, S, gamma):
     # here are the "statistics" for psi0, psi1 and psi2
     # Produced intermediate results:
     # _psi1        NxM
+    mu = variational_posterior.mean
+    S = variational_posterior.variance
+    gamma = variational_posterior.binary_prob
 
     psi0 = np.empty(mu.shape[0])
     psi0[:] = variance
@@ -37,7 +38,6 @@ def _psi1computations(variance, lengthscale, Z, mu, S, gamma):
     # Produced intermediate results:
     # _psi1        NxM
-
 
     lengthscale2 = np.square(lengthscale)
 
     # psi1
@@ -147,11 +147,6 @@ def _psi1compDer(dL_dpsi1, variance, lengthscale, Z, mu, S, gamma):
     _dL_dS = np.einsum('nm,nmq,nmq,nq,nmq->nq',dL_dpsi1,_psi1_q,_psi1_exp_dist_sq,_psi1_common,(_psi1_dist_sq-1.))/2. # NxQ
     _dL_dZ = np.einsum('nm,nmq,nmq->mq',dL_dpsi1,_psi1_q, (- _psi1_common[:,None,:] * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z))
     _dL_dlengthscale = lengthscale* np.einsum('nm,nmq,nmq->q',dL_dpsi1,_psi1_q,(_psi1_common[:,None,:]*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + (1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z))
 
-    # _dpsi1_dmu = _psi1_q * (_psi1_exp_dist_sq * _psi1_dist * _psi1_common) # NxMxQ
-    # _dpsi1_dS = _psi1_q * (_psi1_exp_dist_sq * _psi1_common * 0.5 * (_psi1_dist_sq - 1.)) # NxMxQ
-    # _dpsi1_dZ = _psi1_q * (- _psi1_common * _psi1_dist * _psi1_exp_dist_sq - (1-gamma[:,None,:])/lengthscale2*Z[None,:,:]*_psi1_exp_Z) # NxMxQ
-    # _dpsi1_dlengthscale = 2.*lengthscale*_psi1_q * (0.5*_psi1_common*(S[:,None,:]/lengthscale2+_psi1_dist_sq)*_psi1_exp_dist_sq + 0.5*(1-gamma[:,None,:])*np.square(Z[None,:,:]/lengthscale2)*_psi1_exp_Z) # NxMxQ
-
 
     return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma
@@ -199,15 +194,6 @@ def _psi2compDer(dL_dpsi2, variance, lengthscale, Z, mu, S, gamma):
     _dL_dmu = -2.*np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q,_psi2_common,_psi2_mudist,_psi2_exp_dist_sq)
     _dL_dS = np.einsum('mo,nmoq,nq,nmoq,nmoq->nq',dL_dpsi2,_psi2_q, _psi2_common, (2.*_psi2_mudist_sq-1.), _psi2_exp_dist_sq)
     _dL_dZ = 2.*np.einsum('mo,nmoq,nmoq->mq',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(-_psi2_Zdist*_psi2_denom[:,None,None,:]+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z))
-    # print _psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq #+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z)
     _dL_dlengthscale = 2.*lengthscale* np.einsum('mo,nmoq,nmoq->q',dL_dpsi2,_psi2_q,(_psi2_common[:,None,None,:]*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom[:,None,None,:]+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z))
 
-    # _dpsi2_dvariance = 2. * _psi2/variance # NxMxM
-    # _dpsi2_dgamma = _psi2_q * (_psi2_exp_dist_sq/_psi2_denom_sqrt - _psi2_exp_Z) # NxMxMxQ
-    # _dpsi2_dmu = _psi2_q * (-2.*_psi2_common*_psi2_mudist * _psi2_exp_dist_sq) # NxMxMxQ
-    # _dpsi2_dS = _psi2_q * (_psi2_common * (2.*_psi2_mudist_sq - 1.) * _psi2_exp_dist_sq) # NxMxMxQ
-    # _dpsi2_dZ = 2.*_psi2_q * (_psi2_common*(-_psi2_Zdist*_psi2_denom+_psi2_mudist)*_psi2_exp_dist_sq - (1-gamma[:,None,None,:])*Z[:,None,:]/lengthscale2*_psi2_exp_Z) # NxMxMxQ
-    # _dpsi2_dlengthscale = 2.*lengthscale* _psi2_q * (_psi2_common*(S[:,None,None,:]/lengthscale2+_psi2_Zdist_sq*_psi2_denom+_psi2_mudist_sq)*_psi2_exp_dist_sq+(1-gamma[:,None,None,:])*_psi2_Z_sq_sum*0.5/lengthscale2*_psi2_exp_Z) # NxMxMxQ
-
-
     return _dL_dvariance, _dL_dlengthscale, _dL_dZ, _dL_dmu, _dL_dS, _dL_dgamma
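Derivative code such as _psi1compDer and _psi2compDer is most easily validated against finite differences of a scalar objective. A generic helper sketch (not part of this module; f and x are placeholders for whatever objective and parameter vector are being checked):

import numpy as np

def finite_diff_grad(f, x, eps=1e-6):
    # central differences, one entry of x at a time
    x = np.asarray(x, dtype=float)
    g = np.zeros_like(x)
    for i in range(x.size):
        e = np.zeros_like(x)
        e.flat[i] = eps
        g.flat[i] = (f(x + e) - f(x - e)) / (2. * eps)
    return g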
@@ -1,535 +1,474 @@
|
||||||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
|
||||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
The package for the psi statistics computation on GPU
|
The module for psi-statistics for RBF kernel for Spike-and-Slab GPLVM
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from GPy.util.caching import Cache_this
|
from ....util.caching import Cache_this
|
||||||
|
from . import PSICOMP_RBF
|
||||||
from ....util import gpu_init
|
from ....util import gpu_init
|
||||||
|
|
||||||
try:
|
try:
|
||||||
import pycuda.gpuarray as gpuarray
|
import pycuda.gpuarray as gpuarray
|
||||||
from scikits.cuda import cublas
|
from pycuda.compiler import SourceModule
|
||||||
from pycuda.reduction import ReductionKernel
|
from ....util.linalg_gpu import sum_axis
|
||||||
from pycuda.elementwise import ElementwiseKernel
|
|
||||||
from ....util import linalg_gpu
|
|
||||||
|
|
||||||
|
|
||||||
# The kernel form computing psi1 het_noise
|
|
||||||
comp_psi1 = ElementwiseKernel(
|
|
||||||
"double *psi1, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q",
|
|
||||||
"psi1[i] = comp_psi1_element(var, l, Z, mu, S, logGamma, log1Gamma, logpsi1denom, N, M, Q, i)",
|
|
||||||
"comp_psi1",
|
|
||||||
preamble="""
|
|
||||||
#define IDX_NMQ(n,m,q) ((q*M+m)*N+n)
|
|
||||||
#define IDX_NQ(n,q) (q*N+n)
|
|
||||||
#define IDX_MQ(m,q) (q*M+m)
|
|
||||||
#define LOGEXPSUM(a,b) (a>=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b)))
|
|
||||||
|
|
||||||
__device__ double comp_psi1_element(double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q, int idx)
|
|
||||||
{
|
|
||||||
int n = idx%N;
|
|
||||||
int m = idx/N;
|
|
||||||
double psi1_exp=0;
|
|
||||||
for(int q=0;q<Q;q++){
|
|
||||||
double muZ = mu[IDX_NQ(n,q)]-Z[IDX_MQ(m,q)];
|
|
||||||
double exp1 = logGamma[IDX_NQ(n,q)] - (logpsi1denom[IDX_NQ(n,q)] + muZ*muZ/(S[IDX_NQ(n,q)]+l[q]) )/2.0;
|
|
||||||
double exp2 = log1Gamma[IDX_NQ(n,q)] - Z[IDX_MQ(m,q)]*Z[IDX_MQ(m,q)]/(l[q]*2.0);
|
|
||||||
psi1_exp += LOGEXPSUM(exp1,exp2);
|
|
||||||
}
|
|
||||||
return var*exp(psi1_exp);
|
|
||||||
}
|
|
||||||
""")
|
|
||||||
|
|
||||||
# The kernel form computing psi2 het_noise
|
|
||||||
comp_psi2 = ElementwiseKernel(
|
|
||||||
"double *psi2, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q",
|
|
||||||
"psi2[i] = comp_psi2_element(var, l, Z, mu, S, logGamma, log1Gamma, logpsi2denom, N, M, Q, i)",
|
|
||||||
"comp_psi2",
|
|
||||||
preamble="""
|
|
||||||
#define IDX_NMQ(n,m,q) ((q*M+m)*N+n)
|
|
||||||
#define IDX_NQ(n,q) (q*N+n)
|
|
||||||
#define IDX_MQ(m,q) (q*M+m)
|
|
||||||
#define LOGEXPSUM(a,b) (a>=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b)))
|
|
||||||
|
|
||||||
__device__ double comp_psi2_element(double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q, int idx)
|
|
||||||
{
|
|
||||||
// psi2 (n,m1,m2)
|
|
||||||
int m2 = idx/(M*N);
|
|
||||||
int m1 = (idx%(M*N))/N;
|
|
||||||
int n = idx%N;
|
|
||||||
|
|
||||||
double psi2_exp=0;
|
|
||||||
for(int q=0;q<Q;q++){
|
|
||||||
double dZ = Z[IDX_MQ(m1,q)]-Z[IDX_MQ(m2,q)];
|
|
||||||
double muZ = mu[IDX_NQ(n,q)] - (Z[IDX_MQ(m1,q)]+Z[IDX_MQ(m2,q)])/2.0;
|
|
||||||
double exp1 = logGamma[IDX_NQ(n,q)] - (logpsi2denom[IDX_NQ(n,q)])/2.0 - dZ*dZ/(l[q]*4.0) - muZ*muZ/(2*S[IDX_NQ(n,q)]+l[q]);
|
|
||||||
double exp2 = log1Gamma[IDX_NQ(n,q)] - (Z[IDX_MQ(m1,q)]*Z[IDX_MQ(m1,q)]+Z[IDX_MQ(m2,q)]*Z[IDX_MQ(m2,q)])/(l[q]*2.0);
|
|
||||||
psi2_exp += LOGEXPSUM(exp1,exp2);
|
|
||||||
}
|
|
||||||
return var*var*exp(psi2_exp);
|
|
||||||
}
|
|
||||||
""")
|
|
||||||
|
|
||||||
# compute psidenom
|
|
||||||
comp_logpsidenom = ElementwiseKernel(
|
|
||||||
"double *out, double *S, double *l, double scale, int N",
|
|
||||||
"out[i] = comp_logpsidenom_element(S, l, scale, N, i)",
|
|
||||||
"comp_logpsidenom",
|
|
||||||
preamble="""
|
|
||||||
__device__ double comp_logpsidenom_element(double *S, double *l, double scale, int N, int idx)
|
|
||||||
{
|
|
||||||
int q = idx/N;
|
|
||||||
|
|
||||||
return log(scale*S[idx]/l[q]+1.0);
|
|
||||||
}
|
|
||||||
""")
|
|
||||||
|
|
||||||
# The kernel form computing psi1 het_noise
|
|
||||||
comp_dpsi1_dvar = ElementwiseKernel(
|
|
||||||
"double *dpsi1_dvar, double *psi1_neq, double *psi1exp1, double *psi1exp2, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q",
|
|
||||||
"dpsi1_dvar[i] = comp_dpsi1_dvar_element(psi1_neq, psi1exp1, psi1exp2, l, Z, mu, S, logGamma, log1Gamma, logpsi1denom, N, M, Q, i)",
|
|
||||||
"comp_dpsi1_dvar",
|
|
||||||
preamble="""
|
|
||||||
#define IDX_NMQ(n,m,q) ((q*M+m)*N+n)
|
|
||||||
#define IDX_NQ(n,q) (q*N+n)
|
|
||||||
#define IDX_MQ(m,q) (q*M+m)
|
|
||||||
#define LOGEXPSUM(a,b) (a>=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b)))
|
|
||||||
|
|
||||||
__device__ double comp_dpsi1_dvar_element(double *psi1_neq, double *psi1exp1, double *psi1exp2, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi1denom, int N, int M, int Q, int idx)
|
|
||||||
{
|
|
||||||
int n = idx%N;
|
|
||||||
int m = idx/N;
|
|
||||||
|
|
||||||
double psi1_sum = 0;
|
|
||||||
for(int q=0;q<Q;q++){
|
|
||||||
double muZ = mu[IDX_NQ(n,q)]-Z[IDX_MQ(m,q)];
|
|
||||||
double exp1_e = -(muZ*muZ/(S[IDX_NQ(n,q)]+l[q]) )/2.0;
|
|
||||||
double exp1 = logGamma[IDX_NQ(n,q)] - (logpsi1denom[IDX_NQ(n,q)])/2.0 + exp1_e;
|
|
||||||
double exp2_e = - Z[IDX_MQ(m,q)]*Z[IDX_MQ(m,q)]/(l[q]*2.0);
|
|
||||||
double exp2 = log1Gamma[IDX_NQ(n,q)] + exp2_e;
|
|
||||||
double psi1_q = LOGEXPSUM(exp1,exp2);
|
|
||||||
psi1_neq[IDX_NMQ(n,m,q)] = -psi1_q;
|
|
||||||
psi1exp1[IDX_NMQ(n,m,q)] = exp(exp1_e);
|
|
||||||
psi1exp2[IDX_MQ(m,q)] = exp(exp2_e);
|
|
||||||
psi1_sum += psi1_q;
|
|
||||||
}
|
|
||||||
for(int q=0;q<Q;q++) {
|
|
||||||
psi1_neq[IDX_NMQ(n,m,q)] = exp(psi1_neq[IDX_NMQ(n,m,q)]+psi1_sum);
|
|
||||||
}
|
|
||||||
return exp(psi1_sum);
|
|
||||||
}
|
|
||||||
""")
|
|
||||||
|
|
||||||
# The kernel form computing psi1 het_noise
|
|
||||||
comp_psi1_der = ElementwiseKernel(
|
|
||||||
"double *dpsi1_dl, double *dpsi1_dmu, double *dpsi1_dS, double *dpsi1_dgamma, double *dpsi1_dZ, double *psi1_neq, double *psi1exp1, double *psi1exp2, double var, double *l, double *Z, double *mu, double *S, double *gamma, int N, int M, int Q",
|
|
||||||
"dpsi1_dl[i] = comp_psi1_der_element(dpsi1_dmu, dpsi1_dS, dpsi1_dgamma, dpsi1_dZ, psi1_neq, psi1exp1, psi1exp2, var, l, Z, mu, S, gamma, N, M, Q, i)",
|
|
||||||
"comp_psi1_der",
|
|
||||||
preamble="""
|
|
||||||
#define IDX_NMQ(n,m,q) ((q*M+m)*N+n)
|
|
||||||
#define IDX_NQ(n,q) (q*N+n)
|
|
||||||
#define IDX_MQ(m,q) (q*M+m)
|
|
||||||
|
|
||||||
__device__ double comp_psi1_der_element(double *dpsi1_dmu, double *dpsi1_dS, double *dpsi1_dgamma, double *dpsi1_dZ, double *psi1_neq, double *psi1exp1, double *psi1exp2, double var, double *l, double *Z, double *mu, double *S, double *gamma, int N, int M, int Q, int idx)
|
|
||||||
{
|
|
||||||
int q = idx/(M*N);
|
|
||||||
int m = (idx%(M*N))/N;
|
|
||||||
int n = idx%N;
|
|
||||||
|
|
||||||
double neq = psi1_neq[IDX_NMQ(n,m,q)];
|
|
||||||
double gamma_c = gamma[IDX_NQ(n,q)];
|
|
||||||
double Z_c = Z[IDX_MQ(m,q)];
|
|
||||||
double S_c = S[IDX_NQ(n,q)];
|
|
||||||
double l_c = l[q];
|
|
||||||
double l_sqrt_c = sqrt(l[q]);
|
|
||||||
double psi1exp1_c = psi1exp1[IDX_NMQ(n,m,q)];
|
|
||||||
double psi1exp2_c = psi1exp2[IDX_MQ(m,q)];
|
|
||||||
|
|
||||||
double denom = S_c/l_c+1.0;
|
|
||||||
double denom_sqrt = sqrt(denom);
|
|
||||||
double Zmu = Z_c-mu[IDX_NQ(n,q)];
|
|
||||||
double psi1_common = gamma_c/(denom_sqrt*denom*l_c);
|
|
||||||
double gamma1 = 1-gamma_c;
|
|
||||||
|
|
||||||
dpsi1_dgamma[IDX_NMQ(n,m,q)] = var*neq*(psi1exp1_c/denom_sqrt - psi1exp2_c);
|
|
||||||
dpsi1_dmu[IDX_NMQ(n,m,q)] = var*neq*(psi1_common*Zmu*psi1exp1_c);
|
|
||||||
dpsi1_dS[IDX_NMQ(n,m,q)] = var*neq*(psi1_common*(Zmu*Zmu/(S_c+l_c)-1.0)*psi1exp1_c)/2.0;
|
|
||||||
dpsi1_dZ[IDX_NMQ(n,m,q)] = var*neq*(-psi1_common*Zmu*psi1exp1_c-gamma1*Z_c/l_c*psi1exp2_c);
|
|
||||||
return var*neq*(psi1_common*(S_c/l_c+Zmu*Zmu/(S_c+l_c))*psi1exp1_c+gamma1*Z_c*Z_c/l_c*psi1exp2_c)*l_sqrt_c;
|
|
||||||
}
|
|
||||||
""")
|
|
||||||
|
|
||||||
# The kernel form computing psi1 het_noise
|
|
||||||
comp_dpsi2_dvar = ElementwiseKernel(
|
|
||||||
"double *dpsi2_dvar, double *psi2_neq, double *psi2exp1, double *psi2exp2, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q",
|
|
||||||
"dpsi2_dvar[i] = comp_dpsi2_dvar_element(psi2_neq, psi2exp1, psi2exp2, var, l, Z, mu, S, logGamma, log1Gamma, logpsi2denom, N, M, Q, i)",
|
|
||||||
"comp_dpsi2_dvar",
|
|
||||||
preamble="""
|
|
||||||
#define IDX_NMMQ(n,m1,m2,q) (((q*M+m2)*M+m1)*N+n)
|
|
||||||
#define IDX_MMQ(m1,m2,q) ((q*M+m2)*M+m1)
|
|
||||||
#define IDX_NMQ(n,m,q) ((q*M+m)*N+n)
|
|
||||||
#define IDX_NQ(n,q) (q*N+n)
|
|
||||||
#define IDX_MQ(m,q) (q*M+m)
|
|
||||||
#define LOGEXPSUM(a,b) (a>=b?a+log(1.0+exp(b-a)):b+log(1.0+exp(a-b)))
|
|
||||||
|
|
||||||
__device__ double comp_dpsi2_dvar_element(double *psi2_neq, double *psi2exp1, double *psi2exp2, double var, double *l, double *Z, double *mu, double *S, double *logGamma, double *log1Gamma, double *logpsi2denom, int N, int M, int Q, int idx)
|
|
||||||
{
|
|
||||||
// psi2 (n,m1,m2)
|
|
||||||
int m2 = idx/(M*N);
|
|
||||||
int m1 = (idx%(M*N))/N;
|
|
||||||
int n = idx%N;
|
|
||||||
|
|
||||||
double psi2_sum=0;
|
|
||||||
for(int q=0;q<Q;q++){
|
|
||||||
double dZ = Z[IDX_MQ(m1,q)]-Z[IDX_MQ(m2,q)];
|
|
||||||
double muZ = mu[IDX_NQ(n,q)] - (Z[IDX_MQ(m1,q)]+Z[IDX_MQ(m2,q)])/2.0;
|
|
||||||
double exp1_e = - dZ*dZ/(l[q]*4.0) - muZ*muZ/(2*S[IDX_NQ(n,q)]+l[q]);
|
|
||||||
double exp1 = logGamma[IDX_NQ(n,q)] - (logpsi2denom[IDX_NQ(n,q)])/2.0 +exp1_e;
|
|
||||||
double exp2_e = - (Z[IDX_MQ(m1,q)]*Z[IDX_MQ(m1,q)]+Z[IDX_MQ(m2,q)]*Z[IDX_MQ(m2,q)])/(l[q]*2.0);
|
|
||||||
double exp2 = log1Gamma[IDX_NQ(n,q)] + exp2_e;
|
|
||||||
double psi2_q = LOGEXPSUM(exp1,exp2);
|
|
||||||
psi2_neq[IDX_NMMQ(n,m1,m2,q)] = -psi2_q;
|
|
||||||
psi2exp1[IDX_NMMQ(n,m1,m2,q)] = exp(exp1_e);
|
|
||||||
psi2exp2[IDX_MMQ(m1,m2,q)] = exp(exp2_e);
|
|
||||||
psi2_sum += psi2_q;
|
|
||||||
}
|
|
||||||
for(int q=0;q<Q;q++) {
|
|
||||||
psi2_neq[IDX_NMMQ(n,m1,m2,q)] = exp(psi2_neq[IDX_NMMQ(n,m1,m2,q)]+psi2_sum);
|
|
||||||
}
|
|
||||||
return 2*var*exp(psi2_sum);
|
|
||||||
}
|
|
||||||
""")
|
|
||||||
|
|
||||||
# The kernel form computing psi1 het_noise
|
|
||||||
comp_psi2_der = ElementwiseKernel(
|
|
||||||
"double *dpsi2_dl, double *dpsi2_dmu, double *dpsi2_dS, double *dpsi2_dgamma, double *dpsi2_dZ, double *psi2_neq, double *psi2exp1, double *psi2exp2, double var, double *l, double *Z, double *mu, double *S, double *gamma, int N, int M, int Q",
|
|
||||||
"dpsi2_dl[i] = comp_psi2_der_element(dpsi2_dmu, dpsi2_dS, dpsi2_dgamma, dpsi2_dZ, psi2_neq, psi2exp1, psi2exp2, var, l, Z, mu, S, gamma, N, M, Q, i)",
|
|
||||||
"comp_psi2_der",
|
|
||||||
preamble="""
|
|
||||||
#define IDX_NMMQ(n,m1,m2,q) (((q*M+m2)*M+m1)*N+n)
|
|
||||||
#define IDX_MMQ(m1,m2,q) ((q*M+m2)*M+m1)
|
|
||||||
#define IDX_NMQ(n,m,q) ((q*M+m)*N+n)
|
|
||||||
#define IDX_NQ(n,q) (q*N+n)
|
|
||||||
#define IDX_MQ(m,q) (q*M+m)
|
|
||||||
|
|
||||||
__device__ double comp_psi2_der_element(double *dpsi2_dmu, double *dpsi2_dS, double *dpsi2_dgamma, double *dpsi2_dZ, double *psi2_neq, double *psi2exp1, double *psi2exp2, double var, double *l, double *Z, double *mu, double *S, double *gamma, int N, int M, int Q, int idx)
|
|
||||||
{
|
|
||||||
// dpsi2 (n,m1,m2,q)
|
|
||||||
int q = idx/(M*M*N);
|
|
||||||
int m2 = (idx%(M*M*N))/(M*N);
|
|
||||||
int m1 = (idx%(M*N))/N;
|
|
||||||
int n = idx%N;
|
|
||||||
|
|
||||||
double neq = psi2_neq[IDX_NMMQ(n,m1,m2,q)];
|
|
||||||
double gamma_c = gamma[IDX_NQ(n,q)];
|
|
||||||
double Z1_c = Z[IDX_MQ(m1,q)];
|
|
||||||
double Z2_c = Z[IDX_MQ(m2,q)];
|
|
||||||
double S_c = S[IDX_NQ(n,q)];
|
|
||||||
double l_c = l[q];
|
|
||||||
double l_sqrt_c = sqrt(l[q]);
|
|
||||||
double psi2exp1_c = psi2exp1[IDX_NMMQ(n,m1,m2,q)];
|
|
||||||
double psi2exp2_c = psi2exp2[IDX_MMQ(m1,m2,q)];
|
|
||||||
|
|
||||||
double dZ = Z2_c - Z1_c;
|
|
||||||
double muZ = mu[IDX_NQ(n,q)] - (Z1_c+Z2_c)/2.0;
|
|
||||||
double Z2 = Z1_c*Z1_c+Z2_c*Z2_c;
|
|
||||||
double denom = 2.0*S_c/l_c+1.0;
|
|
||||||
double denom_sqrt = sqrt(denom);
|
|
||||||
double psi2_common = gamma_c/(denom_sqrt*denom*l_c);
|
|
||||||
double gamma1 = 1-gamma_c;
|
|
||||||
double var2 = var*var;
|
|
||||||
|
|
||||||
dpsi2_dgamma[IDX_NMMQ(n,m1,m2,q)] = var2*neq*(psi2exp1_c/denom_sqrt - psi2exp2_c);
|
|
||||||
dpsi2_dmu[IDX_NMMQ(n,m1,m2,q)] = var2*neq*(-2.0*psi2_common*muZ*psi2exp1_c);
|
|
||||||
dpsi2_dS[IDX_NMMQ(n,m1,m2,q)] = var2*neq*(psi2_common*(2.0*muZ*muZ/(2.0*S_c+l_c)-1.0)*psi2exp1_c);
|
|
||||||
dpsi2_dZ[IDX_NMMQ(n,m1,m2,q)] = var2*neq*(psi2_common*(dZ*denom/-2.0+muZ)*psi2exp1_c-gamma1*Z2_c/l_c*psi2exp2_c)*2.0;
|
|
||||||
return var2*neq*(psi2_common*(S_c/l_c+dZ*dZ*denom/(4.0*l_c)+muZ*muZ/(2.0*S_c+l_c))*psi2exp1_c+gamma1*Z2/(2.0*l_c)*psi2exp2_c)*l_sqrt_c*2.0;
|
|
||||||
}
|
|
||||||
""")
|
|
||||||
|
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
class PSICOMP_SSRBF(object):
|
gpu_code = """
|
||||||
def __init__(self):
|
// define THREADNUM
|
||||||
assert gpu_init.initSuccess, "GPU initialization failed!"
|
|
||||||
self.cublas_handle = gpu_init.cublas_handle
|
#define IDX_NMQ(n,m,q) ((q*M+m)*N+n)
|
||||||
self.gpuCache = None
|
#define IDX_NMM(n,m1,m2) ((m2*M+m1)*N+n)
|
||||||
self.gpuCacheAll = None
|
#define IDX_NQ(n,q) (q*N+n)
|
||||||
|
#define IDX_NM(n,m) (m*N+n)
|
||||||
|
#define IDX_MQ(m,q) (q*M+m)
|
||||||
|
#define IDX_MM(m1,m2) (m2*M+m1)
|
||||||
|
#define IDX_NQB(n,q,b) ((b*Q+q)*N+n)
|
||||||
|
#define IDX_QB(q,b) (b*Q+q)
|
||||||
|
|
||||||
|
// Divide data evenly
|
||||||
|
__device__ void divide_data(int total_data, int psize, int pidx, int *start, int *end) {
|
||||||
|
int residue = (total_data)%psize;
|
||||||
|
if(pidx<residue) {
|
||||||
|
int size = total_data/psize+1;
|
||||||
|
*start = size*pidx;
|
||||||
|
*end = *start+size;
|
||||||
|
} else {
|
||||||
|
int size = total_data/psize;
|
||||||
|
*start = size*pidx+residue;
|
||||||
|
*end = *start+size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
def _initGPUCache(self, N, M, Q):
|
__device__ void reduce_sum(double* array, int array_size) {
|
||||||
if self.gpuCache!=None and self.gpuCache['mu_gpu'].shape[0] == N:
|
int s;
|
||||||
return
|
if(array_size >= blockDim.x) {
|
||||||
|
for(int i=blockDim.x+threadIdx.x; i<array_size; i+= blockDim.x) {
|
||||||
|
array[threadIdx.x] += array[i];
|
||||||
|
}
|
||||||
|
array_size = blockDim.x;
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
for(int i=1; i<=array_size;i*=2) {s=i;}
|
||||||
|
if(threadIdx.x < array_size-s) {array[threadIdx.x] += array[s+threadIdx.x];}
|
||||||
|
__syncthreads();
|
||||||
|
for(s=s/2;s>=1;s=s/2) {
|
||||||
|
if(threadIdx.x < s) {array[threadIdx.x] += array[s+threadIdx.x];}
|
||||||
|
__syncthreads();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void compDenom(double *log_denom1, double *log_denom2, double *log_gamma, double*log_gamma1, double *gamma, double *l, double *S, int N, int Q)
|
||||||
|
{
|
||||||
|
int n_start, n_end;
|
||||||
|
divide_data(N, gridDim.x, blockIdx.x, &n_start, &n_end);
|
||||||
|
|
||||||
if self.gpuCacheAll!=None and self.gpuCacheAll['mu_gpu'].shape[0]<N: # Too small cache -> reallocate
|
for(int i=n_start*Q+threadIdx.x; i<n_end*Q; i+=blockDim.x) {
|
||||||
self._releaseMemory()
|
int n=i/Q;
|
||||||
|
int q=i%Q;
|
||||||
|
|
||||||
|
double Snq = S[IDX_NQ(n,q)];
|
||||||
|
double lq = l[q]*l[q];
|
||||||
|
double gnq = gamma[IDX_NQ(n,q)];
|
||||||
|
log_denom1[IDX_NQ(n,q)] = log(Snq/lq+1.);
|
||||||
|
log_denom2[IDX_NQ(n,q)] = log(2.*Snq/lq+1.);
|
||||||
|
log_gamma[IDX_NQ(n,q)] = log(gnq);
|
||||||
|
log_gamma1[IDX_NQ(n,q)] = log(1.-gnq);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void psi1computations(double *psi1, double *log_denom1, double *log_gamma, double*log_gamma1, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
|
||||||
|
{
|
||||||
|
int m_start, m_end;
|
||||||
|
divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
|
||||||
|
|
||||||
|
for(int m=m_start; m<m_end; m++) {
|
||||||
|
for(int n=threadIdx.x; n<N; n+= blockDim.x) {
|
||||||
|
double log_psi1 = 0;
|
||||||
|
for(int q=0;q<Q;q++) {
|
||||||
|
double Zmq = Z[IDX_MQ(m,q)];
|
||||||
|
double muZ = mu[IDX_NQ(n,q)]-Zmq;
|
||||||
|
double Snq = S[IDX_NQ(n,q)];
|
||||||
|
double lq = l[q]*l[q];
|
||||||
|
double exp1 = log_gamma[IDX_NQ(n,q)]-(muZ*muZ/(Snq+lq)+log_denom1[IDX_NQ(n,q)])/(2.);
|
||||||
|
double exp2 = log_gamma1[IDX_NQ(n,q)]-Zmq*Zmq/(2.*lq);
|
||||||
|
log_psi1 += (exp1>exp2)?exp1+log1p(exp(exp2-exp1)):exp2+log1p(exp(exp1-exp2));
|
||||||
|
}
|
||||||
|
psi1[IDX_NM(n,m)] = var*exp(log_psi1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void psi2computations(double *psi2, double *psi2n, double *log_denom2, double *log_gamma, double*log_gamma1, double var, double *l, double *Z, double *mu, double *S, int N, int M, int Q)
|
||||||
|
{
|
||||||
|
int psi2_idx_start, psi2_idx_end;
|
||||||
|
__shared__ double psi2_local[THREADNUM];
|
||||||
|
divide_data((M+1)*M/2, gridDim.x, blockIdx.x, &psi2_idx_start, &psi2_idx_end);
|
||||||
|
|
||||||
|
for(int psi2_idx=psi2_idx_start; psi2_idx<psi2_idx_end; psi2_idx++) {
|
||||||
|
int m1 = int((sqrt(8.*psi2_idx+1.)-1.)/2.);
|
||||||
|
int m2 = psi2_idx - (m1+1)*m1/2;
|
||||||
|
|
||||||
if self.gpuCacheAll == None:
|
psi2_local[threadIdx.x] = 0;
|
||||||
self.gpuCacheAll = {
|
for(int n=threadIdx.x;n<N;n+=blockDim.x) {
|
||||||
|
double log_psi2_n = 0;
|
||||||
|
for(int q=0;q<Q;q++) {
|
||||||
|
double Zm1q = Z[IDX_MQ(m1,q)];
|
||||||
|
double Zm2q = Z[IDX_MQ(m2,q)];
|
||||||
|
double dZ = Zm1q - Zm2q;
|
||||||
|
double muZhat = mu[IDX_NQ(n,q)]- (Zm1q+Zm2q)/2.;
|
||||||
|
double Z2 = Zm1q*Zm1q+Zm2q*Zm2q;
|
||||||
|
double Snq = S[IDX_NQ(n,q)];
|
||||||
|
double lq = l[q]*l[q];
|
||||||
|
double exp1 = dZ*dZ/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_denom2[IDX_NQ(n,q)]/2. + log_gamma[IDX_NQ(n,q)];
|
||||||
|
double exp2 = log_gamma1[IDX_NQ(n,q)] - Z2/(2.*lq);
|
||||||
|
log_psi2_n += (exp1>exp2)?exp1+log1p(exp(exp2-exp1)):exp2+log1p(exp(exp1-exp2));
|
||||||
|
}
|
||||||
|
double exp_psi2_n = exp(log_psi2_n);
|
||||||
|
psi2n[IDX_NMM(n,m1,m2)] = var*var*exp_psi2_n;
|
||||||
|
if(m1!=m2) { psi2n[IDX_NMM(n,m2,m1)] = var*var*exp_psi2_n;}
|
||||||
|
psi2_local[threadIdx.x] += exp_psi2_n;
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
reduce_sum(psi2_local, THREADNUM);
|
||||||
|
if(threadIdx.x==0) {
|
||||||
|
psi2[IDX_MM(m1,m2)] = var*var*psi2_local[0];
|
||||||
|
if(m1!=m2) { psi2[IDX_MM(m2,m1)] = var*var*psi2_local[0]; }
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void psi1compDer(double *dvar, double *dl, double *dZ, double *dmu, double *dS, double *dgamma, double *dL_dpsi1, double *psi1, double *log_denom1, double *log_gamma, double*log_gamma1, double var, double *l, double *Z, double *mu, double *S, double *gamma, int N, int M, int Q)
|
||||||
|
{
|
||||||
|
int m_start, m_end;
|
||||||
|
__shared__ double g_local[THREADNUM];
|
||||||
|
divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
|
||||||
|
int P = int(ceil(double(N)/THREADNUM));
|
||||||
|
|
||||||
|
double dvar_local = 0;
|
||||||
|
for(int q=0;q<Q;q++) {
|
||||||
|
double lq_sqrt = l[q];
|
||||||
|
double lq = lq_sqrt*lq_sqrt;
|
||||||
|
double dl_local = 0;
|
||||||
|
for(int p=0;p<P;p++) {
|
||||||
|
int n = p*THREADNUM + threadIdx.x;
|
||||||
|
double dmu_local = 0;
|
||||||
|
double dS_local = 0;
|
||||||
|
double dgamma_local = 0;
|
||||||
|
double Snq,mu_nq,gnq,log_gnq,log_gnq1,log_de;
|
||||||
|
if(n<N) {Snq = S[IDX_NQ(n,q)]; mu_nq=mu[IDX_NQ(n,q)]; gnq = gamma[IDX_NQ(n,q)];
|
||||||
|
log_gnq = log_gamma[IDX_NQ(n,q)]; log_gnq1 = log_gamma1[IDX_NQ(n,q)];
|
||||||
|
log_de = log_denom1[IDX_NQ(n,q)];}
|
||||||
|
for(int m=m_start; m<m_end; m++) {
|
||||||
|
if(n<N) {
|
||||||
|
double lpsi1 = psi1[IDX_NM(n,m)]*dL_dpsi1[IDX_NM(n,m)];
|
||||||
|
if(q==0) {dvar_local += lpsi1;}
|
||||||
|
|
||||||
|
double Zmq = Z[IDX_MQ(m,q)];
|
||||||
|
double Zmu = Zmq - mu_nq;
|
||||||
|
double denom = Snq+lq;
|
||||||
|
double Zmu2_denom = Zmu*Zmu/denom;
|
||||||
|
|
||||||
|
double exp1 = log_gnq-(Zmu*Zmu/(Snq+lq)+log_de)/(2.);
|
||||||
|
double exp2 = log_gnq1-Zmq*Zmq/(2.*lq);
|
||||||
|
double d_exp1,d_exp2;
|
||||||
|
if(exp1>exp2) {
|
||||||
|
d_exp1 = 1.;
|
||||||
|
d_exp2 = exp(exp2-exp1);
|
||||||
|
} else {
|
||||||
|
d_exp1 = exp(exp1-exp2);
|
||||||
|
d_exp2 = 1.;
|
||||||
|
}
|
||||||
|
double exp_sum = d_exp1+d_exp2;
|
||||||
|
|
||||||
|
dmu_local += lpsi1*Zmu*d_exp1/(denom*exp_sum);
|
||||||
|
dS_local += lpsi1*(Zmu2_denom-1.)*d_exp1/(denom*exp_sum);
|
||||||
|
dgamma_local += lpsi1*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum;
|
||||||
|
dl_local += lpsi1*((Zmu2_denom+Snq/lq)/denom*d_exp1+Zmq*Zmq/(lq*lq)*d_exp2)/(2.*exp_sum);
|
||||||
|
g_local[threadIdx.x] = lpsi1*(-Zmu/denom*d_exp1-Zmq/lq*d_exp2)/exp_sum;
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
reduce_sum(g_local, p<P-1?THREADNUM:N-(P-1)*THREADNUM);
|
||||||
|
if(threadIdx.x==0) {dZ[IDX_MQ(m,q)] += g_local[0];}
|
||||||
|
}
|
||||||
|
if(n<N) {
|
||||||
|
dmu[IDX_NQB(n,q,blockIdx.x)] += dmu_local;
|
||||||
|
dS[IDX_NQB(n,q,blockIdx.x)] += dS_local/2.;
|
||||||
|
dgamma[IDX_NQB(n,q,blockIdx.x)] += dgamma_local;
|
||||||
|
}
|
||||||
|
__threadfence_block();
|
||||||
|
}
|
||||||
|
g_local[threadIdx.x] = dl_local*2.*lq_sqrt;
|
||||||
|
__syncthreads();
|
||||||
|
reduce_sum(g_local, THREADNUM);
|
||||||
|
if(threadIdx.x==0) {dl[IDX_QB(q,blockIdx.x)] += g_local[0];}
|
||||||
|
}
|
||||||
|
g_local[threadIdx.x] = dvar_local;
|
||||||
|
__syncthreads();
|
||||||
|
reduce_sum(g_local, THREADNUM);
|
||||||
|
if(threadIdx.x==0) {dvar[blockIdx.x] += g_local[0]/var;}
|
||||||
|
}
|
||||||
|
|
||||||
|
__global__ void psi2compDer(double *dvar, double *dl, double *dZ, double *dmu, double *dS, double *dgamma, double *dL_dpsi2, double *psi2n, double *log_denom2, double *log_gamma, double*log_gamma1, double var, double *l, double *Z, double *mu, double *S, double *gamma, int N, int M, int Q)
|
||||||
|
{
|
||||||
|
int m_start, m_end;
|
||||||
|
__shared__ double g_local[THREADNUM];
|
||||||
|
divide_data(M, gridDim.x, blockIdx.x, &m_start, &m_end);
|
||||||
|
int P = int(ceil(double(N)/THREADNUM));
|
||||||
|
|
||||||
|
double dvar_local = 0;
|
||||||
|
for(int q=0;q<Q;q++) {
|
||||||
|
double lq_sqrt = l[q];
|
||||||
|
double lq = lq_sqrt*lq_sqrt;
|
||||||
|
double dl_local = 0;
|
||||||
|
for(int p=0;p<P;p++) {
|
||||||
|
int n = p*THREADNUM + threadIdx.x;
|
||||||
|
double dmu_local = 0;
|
||||||
|
double dS_local = 0;
|
||||||
|
double dgamma_local = 0;
|
||||||
|
double Snq,mu_nq,gnq,log_gnq,log_gnq1,log_de;
|
||||||
|
if(n<N) {Snq = S[IDX_NQ(n,q)]; mu_nq=mu[IDX_NQ(n,q)]; gnq = gamma[IDX_NQ(n,q)];
|
||||||
|
log_gnq = log_gamma[IDX_NQ(n,q)]; log_gnq1 = log_gamma1[IDX_NQ(n,q)];
|
||||||
|
log_de = log_denom2[IDX_NQ(n,q)];}
|
||||||
|
for(int m1=m_start; m1<m_end; m1++) {
|
||||||
|
g_local[threadIdx.x] = 0;
|
||||||
|
for(int m2=0;m2<M;m2++) {
|
||||||
|
if(n<N) {
|
||||||
|
double lpsi2 = psi2n[IDX_NMM(n,m1,m2)]*dL_dpsi2[IDX_MM(m1,m2)];
|
||||||
|
if(q==0) {dvar_local += lpsi2;}
|
||||||
|
|
||||||
|
double Zm1q = Z[IDX_MQ(m1,q)];
|
||||||
|
double Zm2q = Z[IDX_MQ(m2,q)];
|
||||||
|
double dZ = Zm1q - Zm2q;
|
||||||
|
double Z2 = Zm1q*Zm1q+Zm2q*Zm2q;
|
||||||
|
double muZhat = mu_nq - (Zm1q + Zm2q)/2.;
|
||||||
|
double denom = 2.*Snq+lq;
|
||||||
|
double muZhat2_denom = muZhat*muZhat/denom;
|
||||||
|
|
||||||
|
double exp1 = dZ*dZ/(-4.*lq)-muZhat*muZhat/(2.*Snq+lq) - log_de/2. + log_gnq;
|
||||||
|
double exp2 = log_gnq1 - Z2/(2.*lq);
|
||||||
|
double d_exp1,d_exp2;
|
||||||
|
if(exp1>exp2) {
|
||||||
|
d_exp1 = 1.;
|
||||||
|
d_exp2 = exp(exp2-exp1);
|
||||||
|
} else {
|
||||||
|
d_exp1 = exp(exp1-exp2);
|
||||||
|
d_exp2 = 1.;
|
||||||
|
}
|
||||||
|
double exp_sum = d_exp1+d_exp2;
|
||||||
|
|
||||||
|
dmu_local += lpsi2*muZhat/denom*d_exp1/exp_sum;
|
||||||
|
dS_local += lpsi2*(2.*muZhat2_denom-1.)/denom*d_exp1/exp_sum;
|
||||||
|
dgamma_local += lpsi2*(d_exp1/gnq-d_exp2/(1.-gnq))/exp_sum;
|
||||||
|
dl_local += lpsi2*(((Snq/lq+muZhat2_denom)/denom+dZ*dZ/(4.*lq*lq))*d_exp1+Z2/(2.*lq*lq)*d_exp2)/exp_sum;
|
||||||
|
g_local[threadIdx.x] += 2.*lpsi2*((muZhat/denom-dZ/(2*lq))*d_exp1-Zm1q/lq*d_exp2)/exp_sum;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
__syncthreads();
|
||||||
|
reduce_sum(g_local, p<P-1?THREADNUM:N-(P-1)*THREADNUM);
|
||||||
|
if(threadIdx.x==0) {dZ[IDX_MQ(m1,q)] += g_local[0];}
|
||||||
|
}
|
||||||
|
if(n<N) {
|
||||||
|
dmu[IDX_NQB(n,q,blockIdx.x)] += -2.*dmu_local;
|
||||||
|
dS[IDX_NQB(n,q,blockIdx.x)] += dS_local;
|
||||||
|
dgamma[IDX_NQB(n,q,blockIdx.x)] += dgamma_local;
|
||||||
|
}
|
||||||
|
__threadfence_block();
|
||||||
|
}
|
||||||
|
g_local[threadIdx.x] = dl_local*2.*lq_sqrt;
|
||||||
|
__syncthreads();
|
||||||
|
reduce_sum(g_local, THREADNUM);
|
||||||
|
if(threadIdx.x==0) {dl[IDX_QB(q,blockIdx.x)] += g_local[0];}
|
||||||
|
}
|
||||||
|
g_local[threadIdx.x] = dvar_local;
|
||||||
|
__syncthreads();
|
||||||
|
reduce_sum(g_local, THREADNUM);
|
||||||
|
if(threadIdx.x==0) {dvar[blockIdx.x] += g_local[0]*2/var;}
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
|
||||||
|
class PSICOMP_SSRBF_GPU(PSICOMP_RBF):
|
||||||
|
|
||||||
|
def __init__(self, threadnum=128, blocknum=15, GPU_direct=False):
|
||||||
|
self.GPU_direct = GPU_direct
|
||||||
|
self.gpuCache = None
|
||||||
|
|
||||||
|
self.threadnum = threadnum
|
||||||
|
self.blocknum = blocknum
|
||||||
|
module = SourceModule("#define THREADNUM "+str(self.threadnum)+"\n"+gpu_code)
|
||||||
|
self.g_psi1computations = module.get_function('psi1computations')
|
||||||
|
self.g_psi1computations.prepare('PPPPdPPPPiii')
|
||||||
|
self.g_psi2computations = module.get_function('psi2computations')
|
||||||
|
self.g_psi2computations.prepare('PPPPPdPPPPiii')
|
||||||
|
self.g_psi1compDer = module.get_function('psi1compDer')
|
||||||
|
self.g_psi1compDer.prepare('PPPPPPPPPPPdPPPPPiii')
|
||||||
|
self.g_psi2compDer = module.get_function('psi2compDer')
|
||||||
|
self.g_psi2compDer.prepare('PPPPPPPPPPPdPPPPPiii')
|
||||||
|
self.g_compDenom = module.get_function('compDenom')
|
||||||
|
self.g_compDenom.prepare('PPPPPPPii')
|
||||||
|
|
||||||
|
def __deepcopy__(self, memo):
|
||||||
|
s = PSICOMP_SSRBF_GPU(threadnum=self.threadnum, blocknum=self.blocknum, GPU_direct=self.GPU_direct)
|
||||||
|
memo[id(self)] = s
|
||||||
|
return s
|
||||||
|
|
||||||
|
def _initGPUCache(self, N, M, Q):
|
||||||
|
if self.gpuCache == None:
|
||||||
|
self.gpuCache = {
|
||||||
'l_gpu' :gpuarray.empty((Q,),np.float64,order='F'),
|
'l_gpu' :gpuarray.empty((Q,),np.float64,order='F'),
|
||||||
'Z_gpu' :gpuarray.empty((M,Q),np.float64,order='F'),
|
'Z_gpu' :gpuarray.empty((M,Q),np.float64,order='F'),
|
||||||
'mu_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
'mu_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
||||||
'S_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
'S_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
||||||
'gamma_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
'gamma_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
||||||
'logGamma_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
|
||||||
'log1Gamma_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
|
||||||
'logpsi1denom_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
|
||||||
'logpsi2denom_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
|
||||||
'psi0_gpu' :gpuarray.empty((N,),np.float64,order='F'),
|
|
||||||
'psi1_gpu' :gpuarray.empty((N,M),np.float64,order='F'),
|
'psi1_gpu' :gpuarray.empty((N,M),np.float64,order='F'),
|
||||||
'psi2_gpu' :gpuarray.empty((N,M,M),np.float64,order='F'),
|
'psi2_gpu' :gpuarray.empty((M,M),np.float64,order='F'),
|
||||||
# derivatives psi1
|
'psi2n_gpu' :gpuarray.empty((N,M,M),np.float64,order='F'),
|
||||||
'psi1_neq_gpu' :gpuarray.empty((N,M,Q),np.float64, order='F'),
|
'dL_dpsi1_gpu' :gpuarray.empty((N,M),np.float64,order='F'),
|
||||||
'psi1exp1_gpu' :gpuarray.empty((N,M,Q),np.float64, order='F'),
|
'dL_dpsi2_gpu' :gpuarray.empty((M,M),np.float64,order='F'),
|
||||||
'psi1exp2_gpu' :gpuarray.empty((N,M,Q),np.float64, order='F'),
|
'log_denom1_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
||||||
'dpsi1_dvar_gpu' :gpuarray.empty((N,M),np.float64, order='F'),
|
'log_denom2_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
||||||
'dpsi1_dl_gpu' :gpuarray.empty((N,M,Q),np.float64, order='F'),
|
'log_gamma_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
||||||
'dpsi1_dZ_gpu' :gpuarray.empty((N,M,Q),np.float64, order='F'),
|
'log_gamma1_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
||||||
'dpsi1_dgamma_gpu' :gpuarray.empty((N,M,Q),np.float64, order='F'),
|
# derivatives
|
||||||
'dpsi1_dmu_gpu' :gpuarray.empty((N,M,Q),np.float64, order='F'),
|
'dvar_gpu' :gpuarray.empty((self.blocknum,),np.float64, order='F'),
|
||||||
'dpsi1_dS_gpu' :gpuarray.empty((N,M,Q),np.float64, order='F'),
|
'dl_gpu' :gpuarray.empty((Q,self.blocknum),np.float64, order='F'),
|
||||||
# derivatives psi2
|
'dZ_gpu' :gpuarray.empty((M,Q),np.float64, order='F'),
|
||||||
'psi2_neq_gpu' :gpuarray.empty((N,M,M,Q),np.float64, order='F'),
|
'dmu_gpu' :gpuarray.empty((N,Q,self.blocknum),np.float64, order='F'),
|
||||||
'psi2exp1_gpu' :gpuarray.empty((N,M,M,Q),np.float64, order='F'),
|
'dS_gpu' :gpuarray.empty((N,Q,self.blocknum),np.float64, order='F'),
|
||||||
'psi2exp2_gpu' :gpuarray.empty((M,M,Q),np.float64, order='F'),
|
'dgamma_gpu' :gpuarray.empty((N,Q,self.blocknum),np.float64, order='F'),
|
||||||
'dpsi2_dvar_gpu' :gpuarray.empty((N,M,M),np.float64, order='F'),
|
# grad
|
||||||
'dpsi2_dl_gpu' :gpuarray.empty((N,M,M,Q),np.float64, order='F'),
|
'grad_l_gpu' :gpuarray.empty((Q,),np.float64, order='F'),
|
||||||
'dpsi2_dZ_gpu' :gpuarray.empty((N,M,M,Q),np.float64, order='F'),
|
'grad_mu_gpu' :gpuarray.empty((N,Q,),np.float64, order='F'),
|
||||||
'dpsi2_dgamma_gpu' :gpuarray.empty((N,M,M,Q),np.float64, order='F'),
|
'grad_S_gpu' :gpuarray.empty((N,Q,),np.float64, order='F'),
|
||||||
'dpsi2_dmu_gpu' :gpuarray.empty((N,M,M,Q),np.float64, order='F'),
|
'grad_gamma_gpu' :gpuarray.empty((N,Q,),np.float64, order='F'),
|
||||||
'dpsi2_dS_gpu' :gpuarray.empty((N,M,M,Q),np.float64, order='F'),
|
|
||||||
# gradients
|
|
||||||
'grad_l_gpu' :gpuarray.empty((Q,),np.float64,order='F'),
|
|
||||||
'grad_Z_gpu' :gpuarray.empty((M,Q),np.float64,order='F'),
|
|
||||||
'grad_mu_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
|
||||||
'grad_S_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
|
||||||
'grad_gamma_gpu' :gpuarray.empty((N,Q),np.float64,order='F'),
|
|
||||||
}
|
}
|
||||||
self.gpuCache = self.gpuCacheAll
|
|
||||||
elif self.gpuCacheAll['mu_gpu'].shape[0]==N:
|
|
||||||
self.gpuCache = self.gpuCacheAll
|
|
||||||
else:
|
else:
|
||||||
# remap to a smaller cache
|
assert N==self.gpuCache['mu_gpu'].shape[0]
|
||||||
self.gpuCache = self.gpuCacheAll.copy()
|
assert M==self.gpuCache['Z_gpu'].shape[0]
|
||||||
Nlist=['mu_gpu','S_gpu','gamma_gpu','logGamma_gpu','log1Gamma_gpu','logpsi1denom_gpu','logpsi2denom_gpu','psi0_gpu','psi1_gpu','psi2_gpu',
|
assert Q==self.gpuCache['l_gpu'].shape[0]
|
||||||
'psi1_neq_gpu','psi1exp1_gpu','psi1exp2_gpu','dpsi1_dvar_gpu','dpsi1_dl_gpu','dpsi1_dZ_gpu','dpsi1_dgamma_gpu','dpsi1_dmu_gpu',
|
|
||||||
'dpsi1_dS_gpu','psi2_neq_gpu','psi2exp1_gpu','dpsi2_dvar_gpu','dpsi2_dl_gpu','dpsi2_dZ_gpu','dpsi2_dgamma_gpu','dpsi2_dmu_gpu','dpsi2_dS_gpu','grad_mu_gpu','grad_S_gpu','grad_gamma_gpu',]
|
|
||||||
oldN = self.gpuCacheAll['mu_gpu'].shape[0]
|
|
||||||
for v in Nlist:
|
|
||||||
u = self.gpuCacheAll[v]
|
|
||||||
self.gpuCache[v] = u.ravel()[:u.size/oldN*N].reshape(*((N,)+u.shape[1:]))
|
|
||||||
|
|
||||||
def _releaseMemory(self):
|
def sync_params(self, lengthscale, Z, mu, S, gamma):
|
||||||
if self.gpuCacheAll!=None:
|
if len(lengthscale)==1:
|
||||||
[v.gpudata.free() for v in self.gpuCacheAll.values()]
|
self.gpuCache['l_gpu'].fill(lengthscale)
|
||||||
self.gpuCacheAll = None
|
else:
|
||||||
self.gpuCache = None
|
self.gpuCache['l_gpu'].set(np.asfortranarray(lengthscale))
|
||||||
|
self.gpuCache['Z_gpu'].set(np.asfortranarray(Z))
|
||||||
|
self.gpuCache['mu_gpu'].set(np.asfortranarray(mu))
|
||||||
|
self.gpuCache['S_gpu'].set(np.asfortranarray(S))
|
||||||
|
self.gpuCache['gamma_gpu'].set(np.asfortranarray(gamma))
|
||||||
|
N,Q = self.gpuCache['S_gpu'].shape
|
||||||
|
self.g_compDenom.prepared_call((self.blocknum,1),(self.threadnum,1,1), self.gpuCache['log_denom1_gpu'].gpudata,self.gpuCache['log_denom2_gpu'].gpudata,self.gpuCache['log_gamma_gpu'].gpudata,self.gpuCache['log_gamma1_gpu'].gpudata,self.gpuCache['gamma_gpu'].gpudata,self.gpuCache['l_gpu'].gpudata,self.gpuCache['S_gpu'].gpudata, np.int32(N), np.int32(Q))
def reset_derivative(self):
|
||||||
|
self.gpuCache['dvar_gpu'].fill(0.)
|
||||||
|
self.gpuCache['dl_gpu'].fill(0.)
|
||||||
|
self.gpuCache['dZ_gpu'].fill(0.)
|
||||||
|
self.gpuCache['dmu_gpu'].fill(0.)
|
||||||
|
self.gpuCache['dS_gpu'].fill(0.)
|
||||||
|
self.gpuCache['dgamma_gpu'].fill(0.)
|
||||||
|
self.gpuCache['grad_l_gpu'].fill(0.)
|
||||||
|
self.gpuCache['grad_mu_gpu'].fill(0.)
|
||||||
|
self.gpuCache['grad_S_gpu'].fill(0.)
|
||||||
|
self.gpuCache['grad_gamma_gpu'].fill(0.)
def estimateMemoryOccupation(self, N, M, Q):
|
def get_dimensions(self, Z, variational_posterior):
|
||||||
"""
|
return variational_posterior.mean.shape[0], Z.shape[0], Z.shape[1]
|
||||||
Estimate the best batch size.
|
|
||||||
N - the number of total datapoints
|
|
||||||
M - the number of inducing points
|
|
||||||
Q - the number of hidden (input) dimensions
|
|
||||||
return: the constant memory size, the memory occupation of batchsize=1
|
|
||||||
unit: GB
|
|
||||||
"""
|
|
||||||
return (2.*Q+2.*M*Q+M*M*Q)*8./1024./1024./1024., (1.+2.*M+10.*Q+2.*M*M+8.*M*Q+7.*M*M*Q)*8./1024./1024./1024.
|
|
||||||
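As an aside, a minimal sketch (not part of the patch) of how the two figures returned by the estimateMemoryOccupation helper in this hunk could be turned into a batch size; the psicomp handle, the 4 GB card and the 0.9 safety factor are assumptions for illustration only:

    # psicomp is assumed to be an instance of the GPU psi-statistics class above
    const_gb, per_point_gb = psicomp.estimateMemoryOccupation(N=10000, M=50, Q=5)
    budget_gb = 4.0 * 0.9                       # assumed usable fraction of a 4 GB GPU
    batchsize = int((budget_gb - const_gb) / per_point_gb)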
|
|
||||||
@Cache_this(limit=1,ignore_args=(0,))
|
@Cache_this(limit=1, ignore_args=(0,))
|
||||||
def psicomputations(self, variance, lengthscale, Z, mu, S, gamma):
|
def psicomputations(self, variance, lengthscale, Z, variational_posterior):
|
||||||
"""Compute Psi statitsitcs"""
|
"""
|
||||||
if isinstance(lengthscale, np.ndarray) and len(lengthscale)>1:
|
Z - MxQ
|
||||||
ARD = True
|
mu - NxQ
|
||||||
else:
|
S - NxQ
|
||||||
ARD = False
|
"""
|
||||||
|
N,M,Q = self.get_dimensions(Z, variational_posterior)
|
||||||
N = mu.shape[0]
|
|
||||||
M = Z.shape[0]
|
|
||||||
Q = mu.shape[1]
|
|
||||||
|
|
||||||
self._initGPUCache(N,M,Q)
|
self._initGPUCache(N,M,Q)
|
||||||
l_gpu = self.gpuCache['l_gpu']
|
self.sync_params(lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
|
||||||
Z_gpu = self.gpuCache['Z_gpu']
|
|
||||||
mu_gpu = self.gpuCache['mu_gpu']
|
|
||||||
S_gpu = self.gpuCache['S_gpu']
|
|
||||||
gamma_gpu = self.gpuCache['gamma_gpu']
|
|
||||||
logGamma_gpu = self.gpuCache['logGamma_gpu']
|
|
||||||
log1Gamma_gpu = self.gpuCache['log1Gamma_gpu']
|
|
||||||
logpsi1denom_gpu = self.gpuCache['logpsi1denom_gpu']
|
|
||||||
logpsi2denom_gpu = self.gpuCache['logpsi2denom_gpu']
|
|
||||||
psi0_gpu = self.gpuCache['psi0_gpu']
|
|
||||||
psi1_gpu = self.gpuCache['psi1_gpu']
|
psi1_gpu = self.gpuCache['psi1_gpu']
|
||||||
psi2_gpu = self.gpuCache['psi2_gpu']
|
psi2_gpu = self.gpuCache['psi2_gpu']
|
||||||
|
psi2n_gpu = self.gpuCache['psi2n_gpu']
|
||||||
|
l_gpu = self.gpuCache['l_gpu']
|
||||||
|
Z_gpu = self.gpuCache['Z_gpu']
|
||||||
|
mu_gpu = self.gpuCache['mu_gpu']
|
||||||
|
S_gpu = self.gpuCache['S_gpu']
|
||||||
|
log_denom1_gpu = self.gpuCache['log_denom1_gpu']
|
||||||
|
log_denom2_gpu = self.gpuCache['log_denom2_gpu']
|
||||||
|
log_gamma_gpu = self.gpuCache['log_gamma_gpu']
|
||||||
|
log_gamma1_gpu = self.gpuCache['log_gamma1_gpu']
|
||||||
|
|
||||||
if ARD:
|
psi0 = np.empty((N,))
|
||||||
l_gpu.set(np.asfortranarray(lengthscale**2))
|
psi0[:] = variance
|
||||||
|
self.g_psi1computations.prepared_call((self.blocknum,1),(self.threadnum,1,1),psi1_gpu.gpudata, log_denom1_gpu.gpudata, log_gamma_gpu.gpudata, log_gamma1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))
|
||||||
|
self.g_psi2computations.prepared_call((self.blocknum,1),(self.threadnum,1,1),psi2_gpu.gpudata, psi2n_gpu.gpudata, log_denom2_gpu.gpudata, log_gamma_gpu.gpudata, log_gamma1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata, np.int32(N), np.int32(M), np.int32(Q))
|
||||||
|
|
||||||
|
if self.GPU_direct:
|
||||||
|
return psi0, psi1_gpu, psi2_gpu
|
||||||
else:
|
else:
|
||||||
l_gpu.fill(lengthscale*lengthscale)
|
return psi0, psi1_gpu.get(), psi2_gpu.get()
|
||||||
Z_gpu.set(np.asfortranarray(Z))
|
|
||||||
mu_gpu.set(np.asfortranarray(mu))
|
@Cache_this(limit=1, ignore_args=(0,1,2,3))
|
||||||
S_gpu.set(np.asfortranarray(S))
|
def psiDerivativecomputations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
|
||||||
gamma_gpu.set(np.asfortranarray(gamma))
|
ARD = (len(lengthscale)!=1)
|
||||||
linalg_gpu.log(gamma_gpu,logGamma_gpu)
|
|
||||||
linalg_gpu.logOne(gamma_gpu,log1Gamma_gpu)
|
|
||||||
comp_logpsidenom(logpsi1denom_gpu, S_gpu,l_gpu,1.0,N)
|
|
||||||
comp_logpsidenom(logpsi2denom_gpu, S_gpu,l_gpu,2.0,N)
|
|
||||||
|
|
||||||
psi0_gpu.fill(variance)
|
N,M,Q = self.get_dimensions(Z, variational_posterior)
|
||||||
comp_psi1(psi1_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi1denom_gpu, N, M, Q)
|
psi1_gpu = self.gpuCache['psi1_gpu']
|
||||||
comp_psi2(psi2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi2denom_gpu, N, M, Q)
|
psi2n_gpu = self.gpuCache['psi2n_gpu']
|
||||||
|
|
||||||
return psi0_gpu, psi1_gpu, psi2_gpu
@Cache_this(limit=1,ignore_args=(0,))
|
|
||||||
def _psiDercomputations(self, variance, lengthscale, Z, mu, S, gamma):
|
|
||||||
"""Compute the derivatives w.r.t. Psi statistics"""
|
|
||||||
N, M, Q = mu.shape[0],Z.shape[0], mu.shape[1]
self._initGPUCache(N,M,Q)
|
|
||||||
l_gpu = self.gpuCache['l_gpu']
|
l_gpu = self.gpuCache['l_gpu']
|
||||||
Z_gpu = self.gpuCache['Z_gpu']
|
Z_gpu = self.gpuCache['Z_gpu']
|
||||||
mu_gpu = self.gpuCache['mu_gpu']
|
mu_gpu = self.gpuCache['mu_gpu']
|
||||||
S_gpu = self.gpuCache['S_gpu']
|
S_gpu = self.gpuCache['S_gpu']
|
||||||
gamma_gpu = self.gpuCache['gamma_gpu']
|
gamma_gpu = self.gpuCache['gamma_gpu']
|
||||||
logGamma_gpu = self.gpuCache['logGamma_gpu']
|
dvar_gpu = self.gpuCache['dvar_gpu']
|
||||||
log1Gamma_gpu = self.gpuCache['log1Gamma_gpu']
|
dl_gpu = self.gpuCache['dl_gpu']
|
||||||
logpsi1denom_gpu = self.gpuCache['logpsi1denom_gpu']
|
dZ_gpu = self.gpuCache['dZ_gpu']
|
||||||
logpsi2denom_gpu = self.gpuCache['logpsi2denom_gpu']
|
dmu_gpu = self.gpuCache['dmu_gpu']
|
||||||
|
dS_gpu = self.gpuCache['dS_gpu']
|
||||||
psi1_neq_gpu = self.gpuCache['psi1_neq_gpu']
|
dgamma_gpu = self.gpuCache['dgamma_gpu']
|
||||||
psi1exp1_gpu = self.gpuCache['psi1exp1_gpu']
|
|
||||||
psi1exp2_gpu = self.gpuCache['psi1exp2_gpu']
|
|
||||||
dpsi1_dvar_gpu = self.gpuCache['dpsi1_dvar_gpu']
|
|
||||||
dpsi1_dl_gpu = self.gpuCache['dpsi1_dl_gpu']
|
|
||||||
dpsi1_dZ_gpu = self.gpuCache['dpsi1_dZ_gpu']
|
|
||||||
dpsi1_dgamma_gpu = self.gpuCache['dpsi1_dgamma_gpu']
|
|
||||||
dpsi1_dmu_gpu = self.gpuCache['dpsi1_dmu_gpu']
|
|
||||||
dpsi1_dS_gpu = self.gpuCache['dpsi1_dS_gpu']
psi2_neq_gpu = self.gpuCache['psi2_neq_gpu']
|
|
||||||
psi2exp1_gpu = self.gpuCache['psi2exp1_gpu']
|
|
||||||
psi2exp2_gpu = self.gpuCache['psi2exp2_gpu']
|
|
||||||
dpsi2_dvar_gpu = self.gpuCache['dpsi2_dvar_gpu']
|
|
||||||
dpsi2_dl_gpu = self.gpuCache['dpsi2_dl_gpu']
|
|
||||||
dpsi2_dZ_gpu = self.gpuCache['dpsi2_dZ_gpu']
|
|
||||||
dpsi2_dgamma_gpu = self.gpuCache['dpsi2_dgamma_gpu']
|
|
||||||
dpsi2_dmu_gpu = self.gpuCache['dpsi2_dmu_gpu']
|
|
||||||
dpsi2_dS_gpu = self.gpuCache['dpsi2_dS_gpu']
#==========================================================================================================
|
|
||||||
# Assuming the l_gpu, Z_gpu, mu_gpu, S_gpu, gamma_gpu, logGamma_gpu, log1Gamma_gpu,
|
|
||||||
# logpsi1denom_gpu, logpsi2denom_gpu have been synchronized.
|
|
||||||
#==========================================================================================================
|
|
||||||
|
|
||||||
# psi1 derivatives
|
|
||||||
comp_dpsi1_dvar(dpsi1_dvar_gpu, psi1_neq_gpu, psi1exp1_gpu,psi1exp2_gpu, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi1denom_gpu, N, M, Q)
|
|
||||||
comp_psi1_der(dpsi1_dl_gpu,dpsi1_dmu_gpu,dpsi1_dS_gpu,dpsi1_dgamma_gpu, dpsi1_dZ_gpu, psi1_neq_gpu,psi1exp1_gpu,psi1exp2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, gamma_gpu, N, M, Q)
# psi2 derivatives
|
|
||||||
comp_dpsi2_dvar(dpsi2_dvar_gpu, psi2_neq_gpu, psi2exp1_gpu,psi2exp2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, logGamma_gpu, log1Gamma_gpu, logpsi2denom_gpu, N, M, Q)
|
|
||||||
comp_psi2_der(dpsi2_dl_gpu,dpsi2_dmu_gpu,dpsi2_dS_gpu,dpsi2_dgamma_gpu, dpsi2_dZ_gpu, psi2_neq_gpu,psi2exp1_gpu,psi2exp2_gpu, variance, l_gpu, Z_gpu, mu_gpu, S_gpu, gamma_gpu, N, M, Q)
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
|
|
||||||
mu = variational_posterior.mean
|
|
||||||
S = variational_posterior.variance
|
|
||||||
gamma = variational_posterior.binary_prob
|
|
||||||
self._psiDercomputations(variance, lengthscale, Z, mu, S, gamma)
|
|
||||||
N, M, Q = mu.shape[0],Z.shape[0], mu.shape[1]
|
|
||||||
|
|
||||||
if isinstance(lengthscale, np.ndarray) and len(lengthscale)>1:
|
|
||||||
ARD = True
|
|
||||||
else:
|
|
||||||
ARD = False
|
|
||||||
|
|
||||||
dpsi1_dvar_gpu = self.gpuCache['dpsi1_dvar_gpu']
|
|
||||||
dpsi2_dvar_gpu = self.gpuCache['dpsi2_dvar_gpu']
|
|
||||||
dpsi1_dl_gpu = self.gpuCache['dpsi1_dl_gpu']
|
|
||||||
dpsi2_dl_gpu = self.gpuCache['dpsi2_dl_gpu']
|
|
||||||
psi1_comb_gpu = self.gpuCache['psi1_neq_gpu']
|
|
||||||
psi2_comb_gpu = self.gpuCache['psi2_neq_gpu']
|
|
||||||
grad_l_gpu = self.gpuCache['grad_l_gpu']
|
grad_l_gpu = self.gpuCache['grad_l_gpu']
|
||||||
|
|
||||||
# variance
|
|
||||||
variance.gradient = gpuarray.sum(dL_dpsi0).get() \
|
|
||||||
+ cublas.cublasDdot(self.cublas_handle, dL_dpsi1.size, dL_dpsi1.gpudata, 1, dpsi1_dvar_gpu.gpudata, 1) \
|
|
||||||
+ cublas.cublasDdot(self.cublas_handle, dL_dpsi2.size, dL_dpsi2.gpudata, 1, dpsi2_dvar_gpu.gpudata, 1)
|
|
||||||
|
|
||||||
# lengthscale
|
|
||||||
if ARD:
|
|
||||||
grad_l_gpu.fill(0.)
|
|
||||||
linalg_gpu.mul_bcast(psi1_comb_gpu, dL_dpsi1, dpsi1_dl_gpu, dL_dpsi1.size)
|
|
||||||
linalg_gpu.sum_axis(grad_l_gpu, psi1_comb_gpu, 1, N*M)
|
|
||||||
linalg_gpu.mul_bcast(psi2_comb_gpu, dL_dpsi2, dpsi2_dl_gpu, dL_dpsi2.size)
|
|
||||||
linalg_gpu.sum_axis(grad_l_gpu, psi2_comb_gpu, 1, N*M*M)
|
|
||||||
lengthscale.gradient = grad_l_gpu.get()
|
|
||||||
else:
|
|
||||||
linalg_gpu.mul_bcast(psi1_comb_gpu, dL_dpsi1, dpsi1_dl_gpu, dL_dpsi1.size)
|
|
||||||
linalg_gpu.mul_bcast(psi2_comb_gpu, dL_dpsi2, dpsi2_dl_gpu, dL_dpsi2.size)
|
|
||||||
lengthscale.gradient = gpuarray.sum(psi1_comb_gpu).get() + gpuarray.sum(psi2_comb_gpu).get()
def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
|
|
||||||
mu = variational_posterior.mean
|
|
||||||
S = variational_posterior.variance
|
|
||||||
gamma = variational_posterior.binary_prob
|
|
||||||
self._psiDercomputations(variance, lengthscale, Z, mu, S, gamma)
|
|
||||||
N, M, Q = mu.shape[0],Z.shape[0], mu.shape[1]
|
|
||||||
|
|
||||||
dpsi1_dZ_gpu = self.gpuCache['dpsi1_dZ_gpu']
|
|
||||||
dpsi2_dZ_gpu = self.gpuCache['dpsi2_dZ_gpu']
|
|
||||||
psi1_comb_gpu = self.gpuCache['psi1_neq_gpu']
|
|
||||||
psi2_comb_gpu = self.gpuCache['psi2_neq_gpu']
|
|
||||||
grad_Z_gpu = self.gpuCache['grad_Z_gpu']
|
|
||||||
|
|
||||||
grad_Z_gpu.fill(0.)
|
|
||||||
linalg_gpu.mul_bcast(psi1_comb_gpu, dL_dpsi1, dpsi1_dZ_gpu, dL_dpsi1.size)
|
|
||||||
linalg_gpu.sum_axis(grad_Z_gpu, psi1_comb_gpu, 1, N)
|
|
||||||
linalg_gpu.mul_bcast(psi2_comb_gpu, dL_dpsi2, dpsi2_dZ_gpu, dL_dpsi2.size)
|
|
||||||
linalg_gpu.sum_axis(grad_Z_gpu, psi2_comb_gpu, 1, N*M)
|
|
||||||
return grad_Z_gpu.get()
def gradients_qX_expectations(self, dL_dpsi1, dL_dpsi2, variance, lengthscale, Z, variational_posterior):
|
|
||||||
mu = variational_posterior.mean
|
|
||||||
S = variational_posterior.variance
|
|
||||||
gamma = variational_posterior.binary_prob
|
|
||||||
self._psiDercomputations(variance, lengthscale, Z, mu, S, gamma)
|
|
||||||
N, M, Q = mu.shape[0],Z.shape[0], mu.shape[1]
|
|
||||||
|
|
||||||
dpsi1_dmu_gpu = self.gpuCache['dpsi1_dmu_gpu']
|
|
||||||
dpsi2_dmu_gpu = self.gpuCache['dpsi2_dmu_gpu']
|
|
||||||
dpsi1_dS_gpu = self.gpuCache['dpsi1_dS_gpu']
|
|
||||||
dpsi2_dS_gpu = self.gpuCache['dpsi2_dS_gpu']
|
|
||||||
dpsi1_dgamma_gpu = self.gpuCache['dpsi1_dgamma_gpu']
|
|
||||||
dpsi2_dgamma_gpu = self.gpuCache['dpsi2_dgamma_gpu']
|
|
||||||
psi1_comb_gpu = self.gpuCache['psi1_neq_gpu']
|
|
||||||
psi2_comb_gpu = self.gpuCache['psi2_neq_gpu']
|
|
||||||
grad_mu_gpu = self.gpuCache['grad_mu_gpu']
|
grad_mu_gpu = self.gpuCache['grad_mu_gpu']
|
||||||
grad_S_gpu = self.gpuCache['grad_S_gpu']
|
grad_S_gpu = self.gpuCache['grad_S_gpu']
|
||||||
grad_gamma_gpu = self.gpuCache['grad_gamma_gpu']
|
grad_gamma_gpu = self.gpuCache['grad_gamma_gpu']
|
||||||
|
log_denom1_gpu = self.gpuCache['log_denom1_gpu']
|
||||||
|
log_denom2_gpu = self.gpuCache['log_denom2_gpu']
|
||||||
|
log_gamma_gpu = self.gpuCache['log_gamma_gpu']
|
||||||
|
log_gamma1_gpu = self.gpuCache['log_gamma1_gpu']
|
||||||
|
|
||||||
# mu gradients
|
if self.GPU_direct:
|
||||||
grad_mu_gpu.fill(0.)
|
dL_dpsi1_gpu = dL_dpsi1
|
||||||
linalg_gpu.mul_bcast(psi1_comb_gpu, dL_dpsi1, dpsi1_dmu_gpu, dL_dpsi1.size)
|
dL_dpsi2_gpu = dL_dpsi2
|
||||||
linalg_gpu.sum_axis(grad_mu_gpu, psi1_comb_gpu, N, M)
|
dL_dpsi0_sum = gpuarray.sum(dL_dpsi0).get()
|
||||||
linalg_gpu.mul_bcast(psi2_comb_gpu, dL_dpsi2, dpsi2_dmu_gpu, dL_dpsi2.size)
|
else:
|
||||||
linalg_gpu.sum_axis(grad_mu_gpu, psi2_comb_gpu, N, M*M)
|
dL_dpsi1_gpu = self.gpuCache['dL_dpsi1_gpu']
|
||||||
|
dL_dpsi2_gpu = self.gpuCache['dL_dpsi2_gpu']
|
||||||
|
dL_dpsi1_gpu.set(np.asfortranarray(dL_dpsi1))
|
||||||
|
dL_dpsi2_gpu.set(np.asfortranarray(dL_dpsi2))
|
||||||
|
dL_dpsi0_sum = dL_dpsi0.sum()
|
||||||
|
|
||||||
# S gradients
|
self.reset_derivative()
|
||||||
grad_S_gpu.fill(0.)
|
# t=self.g_psi1compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi1_gpu,psi1_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
|
||||||
linalg_gpu.mul_bcast(psi1_comb_gpu, dL_dpsi1, dpsi1_dS_gpu, dL_dpsi1.size)
|
# print 'g_psi1compDer '+str(t)
|
||||||
linalg_gpu.sum_axis(grad_S_gpu, psi1_comb_gpu, N, M)
|
# t=self.g_psi2compDer(dvar_gpu,dl_gpu,dZ_gpu,dmu_gpu,dS_gpu,dL_dpsi2_gpu,psi2n_gpu, np.float64(variance),l_gpu,Z_gpu,mu_gpu,S_gpu, np.int32(N), np.int32(M), np.int32(Q), block=(self.threadnum,1,1), grid=(self.blocknum,1),time_kernel=True)
|
||||||
linalg_gpu.mul_bcast(psi2_comb_gpu, dL_dpsi2, dpsi2_dS_gpu, dL_dpsi2.size)
|
# print 'g_psi2compDer '+str(t)
|
||||||
linalg_gpu.sum_axis(grad_S_gpu, psi2_comb_gpu, N, M*M)
|
self.g_psi1compDer.prepared_call((self.blocknum,1),(self.threadnum,1,1),dvar_gpu.gpudata,dl_gpu.gpudata,dZ_gpu.gpudata,dmu_gpu.gpudata,dS_gpu.gpudata,dgamma_gpu.gpudata,dL_dpsi1_gpu.gpudata,psi1_gpu.gpudata, log_denom1_gpu.gpudata, log_gamma_gpu.gpudata, log_gamma1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata,gamma_gpu.gpudata,np.int32(N), np.int32(M), np.int32(Q))
|
||||||
|
self.g_psi2compDer.prepared_call((self.blocknum,1),(self.threadnum,1,1),dvar_gpu.gpudata,dl_gpu.gpudata,dZ_gpu.gpudata,dmu_gpu.gpudata,dS_gpu.gpudata,dgamma_gpu.gpudata,dL_dpsi2_gpu.gpudata,psi2n_gpu.gpudata, log_denom2_gpu.gpudata, log_gamma_gpu.gpudata, log_gamma1_gpu.gpudata, np.float64(variance),l_gpu.gpudata,Z_gpu.gpudata,mu_gpu.gpudata,S_gpu.gpudata,gamma_gpu.gpudata,np.int32(N), np.int32(M), np.int32(Q))
|
||||||
|
|
||||||
|
dL_dvar = dL_dpsi0_sum + gpuarray.sum(dvar_gpu).get()
|
||||||
|
sum_axis(grad_mu_gpu,dmu_gpu,N*Q,self.blocknum)
|
||||||
|
dL_dmu = grad_mu_gpu.get()
|
||||||
|
sum_axis(grad_S_gpu,dS_gpu,N*Q,self.blocknum)
|
||||||
|
dL_dS = grad_S_gpu.get()
|
||||||
|
sum_axis(grad_gamma_gpu,dgamma_gpu,N*Q,self.blocknum)
|
||||||
|
dL_dgamma = grad_gamma_gpu.get()
|
||||||
|
dL_dZ = dZ_gpu.get()
|
||||||
|
if ARD:
|
||||||
|
sum_axis(grad_l_gpu,dl_gpu,Q,self.blocknum)
|
||||||
|
dL_dlengscale = grad_l_gpu.get()
|
||||||
|
else:
|
||||||
|
dL_dlengscale = gpuarray.sum(dl_gpu).get()
|
||||||
|
|
||||||
|
return dL_dvar, dL_dlengscale, dL_dZ, dL_dmu, dL_dS, dL_dgamma
|
||||||
|
|
||||||
|
|
||||||
# gamma gradients
|
|
||||||
grad_gamma_gpu.fill(0.)
|
|
||||||
linalg_gpu.mul_bcast(psi1_comb_gpu, dL_dpsi1, dpsi1_dgamma_gpu, dL_dpsi1.size)
|
|
||||||
linalg_gpu.sum_axis(grad_gamma_gpu, psi1_comb_gpu, N, M)
|
|
||||||
linalg_gpu.mul_bcast(psi2_comb_gpu, dL_dpsi2, dpsi2_dgamma_gpu, dL_dpsi2.size)
|
|
||||||
linalg_gpu.sum_axis(grad_gamma_gpu, psi2_comb_gpu, N, M*M)
return grad_mu_gpu.get(), grad_S_gpu.get(), grad_gamma_gpu.get()
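For reference, a small CPU sketch of the standard RBF psi1 statistic, mirroring the _psi1computations method that appears further down this diff (the spike-and-slab psi1 above reduces to it when gamma -> 1); it is only an assumption-labelled cross-check for the GPU path, not part of the patch:

    import numpy as np

    def psi1_reference(variance, lengthscale, Z, mu, S):
        # mu, S are the variational mean/variance (N x Q); Z the inducing inputs (M x Q)
        l2 = lengthscale**2
        denom = S[:, None, :] / l2 + 1.                                    # N,1,Q
        dist_sq = np.square(Z[None, :, :] - mu[:, None, :]) / l2 / denom   # N,M,Q
        exponent = -0.5 * np.sum(dist_sq + np.log(denom), -1)              # N,M
        return variance * np.exp(exponent)                                 # N,M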
@ -3,13 +3,9 @@
|
||||||
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from scipy import weave
|
|
||||||
from ...util.misc import param_to_array
|
|
||||||
from stationary import Stationary
|
from stationary import Stationary
|
||||||
from GPy.util.caching import Cache_this
|
from psi_comp import PSICOMP_RBF
|
||||||
from ...core.parameterization import variational
|
from psi_comp.rbf_psi_gpucomp import PSICOMP_RBF_GPU
|
||||||
from psi_comp import ssrbf_psi_comp
|
|
||||||
from psi_comp.ssrbf_psi_gpucomp import PSICOMP_SSRBF
|
|
||||||
from ...util.config import *
|
from ...util.config import *
|
||||||
|
|
||||||
class RBF(Stationary):
|
class RBF(Stationary):
|
||||||
|
|
@ -26,10 +22,11 @@ class RBF(Stationary):
|
||||||
super(RBF, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name, useGPU=useGPU)
|
super(RBF, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name, useGPU=useGPU)
|
||||||
self.weave_options = {}
|
self.weave_options = {}
|
||||||
self.group_spike_prob = False
|
self.group_spike_prob = False
|
||||||
|
self.psicomp = PSICOMP_RBF()
|
||||||
if self.useGPU:
|
if self.useGPU:
|
||||||
self.psicomp = PSICOMP_SSRBF()
|
self.psicomp = PSICOMP_RBF_GPU()
|
||||||
|
else:
|
||||||
|
self.psicomp = PSICOMP_RBF()
|
||||||
|
|
||||||
def K_of_r(self, r):
|
def K_of_r(self, r):
|
||||||
return self.variance * np.exp(-0.5 * r**2)
|
return self.variance * np.exp(-0.5 * r**2)
|
||||||
|
|
@ -37,6 +34,15 @@ class RBF(Stationary):
|
||||||
def dK_dr(self, r):
|
def dK_dr(self, r):
|
||||||
return -r*self.K_of_r(r)
|
return -r*self.K_of_r(r)
|
||||||
|
|
||||||
|
def __getstate__(self):
|
||||||
|
dc = super(RBF, self).__getstate__()
|
||||||
|
if self.useGPU:
|
||||||
|
dc['psicomp'] = PSICOMP_RBF()
|
||||||
|
return dc
|
||||||
|
|
||||||
|
def __setstate__(self, state):
|
||||||
|
return super(RBF, self).__setstate__(state)
|
||||||
|
|
||||||
def spectrum(self, omega):
|
def spectrum(self, omega):
|
||||||
assert self.input_dim == 1 #TODO: higher dim spectra?
|
assert self.input_dim == 1 #TODO: higher dim spectra?
|
||||||
return self.variance*np.sqrt(2*np.pi)*self.lengthscale*np.exp(-self.lengthscale*2*omega**2/2)
|
return self.variance*np.sqrt(2*np.pi)*self.lengthscale*np.exp(-self.lengthscale*2*omega**2/2)
|
||||||
|
|
@ -46,331 +52,22 @@ class RBF(Stationary):
|
||||||
#---------------------------------------#
|
#---------------------------------------#
|
||||||
|
|
||||||
def psi0(self, Z, variational_posterior):
|
def psi0(self, Z, variational_posterior):
|
||||||
if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
|
return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[0]
|
||||||
if self.useGPU:
|
|
||||||
return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[0]
|
|
||||||
else:
|
|
||||||
return ssrbf_psi_comp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[0]
|
|
||||||
else:
|
|
||||||
return self.Kdiag(variational_posterior.mean)
|
|
||||||
|
|
||||||
def psi1(self, Z, variational_posterior):
|
def psi1(self, Z, variational_posterior):
|
||||||
if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
|
return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[1]
|
||||||
if self.useGPU:
|
|
||||||
return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[1]
|
|
||||||
else:
|
|
||||||
return ssrbf_psi_comp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[1]
|
|
||||||
else:
|
|
||||||
_, _, _, psi1 = self._psi1computations(Z, variational_posterior)
|
|
||||||
return psi1
|
|
||||||
|
|
||||||
def psi2(self, Z, variational_posterior):
|
def psi2(self, Z, variational_posterior):
|
||||||
if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
|
return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior)[2]
|
||||||
if self.useGPU:
|
|
||||||
return self.psicomp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[2]
|
|
||||||
else:
|
|
||||||
return ssrbf_psi_comp.psicomputations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)[2]
|
|
||||||
else:
|
|
||||||
_, _, _, _, psi2 = self._psi2computations(Z, variational_posterior)
|
|
||||||
return psi2
|
|
||||||
|
|
||||||
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||||
# Spike-and-Slab GPLVM
|
dL_dvar, dL_dlengscale = self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[:2]
|
||||||
if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
|
self.variance.gradient = dL_dvar
|
||||||
if self.useGPU:
|
self.lengthscale.gradient = dL_dlengscale
|
||||||
self.psicomp.update_gradients_expectations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
|
|
||||||
else:
|
|
||||||
# dL_dvar, dL_dlengscale, dL_dZ, dL_dgamma, dL_dmu, dL_dS = ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
|
|
||||||
dL_dvar, dL_dlengscale, _, _, _, _ = ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
|
|
||||||
self.variance.gradient = dL_dvar
|
|
||||||
self.lengthscale.gradient = dL_dlengscale
|
|
||||||
|
|
||||||
# _, _dpsi1_dvariance, _, _, _, _, _dpsi1_dlengthscale = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
|
|
||||||
# _, _dpsi2_dvariance, _, _, _, _, _dpsi2_dlengthscale = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
|
|
||||||
#
|
|
||||||
# #contributions from psi0:
|
|
||||||
# self.variance.gradient = np.sum(dL_dpsi0)
|
|
||||||
#
|
|
||||||
# #from psi1
|
|
||||||
# self.variance.gradient += np.sum(dL_dpsi1 * _dpsi1_dvariance)
|
|
||||||
# if self.ARD:
|
|
||||||
# self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).reshape(-1,self.input_dim).sum(axis=0)
|
|
||||||
# else:
|
|
||||||
# self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).sum()
|
|
||||||
#
|
|
||||||
# #from psi2
|
|
||||||
# self.variance.gradient += (dL_dpsi2 * _dpsi2_dvariance).sum()
|
|
||||||
# if self.ARD:
|
|
||||||
# self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).reshape(-1,self.input_dim).sum(axis=0)
|
|
||||||
# else:
|
|
||||||
# self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).sum()
|
|
||||||
|
|
||||||
elif isinstance(variational_posterior, variational.NormalPosterior):
|
|
||||||
l2 = self.lengthscale**2
|
|
||||||
if l2.size != self.input_dim:
|
|
||||||
l2 = l2*np.ones(self.input_dim)
|
|
||||||
|
|
||||||
#contributions from psi0:
|
|
||||||
self.variance.gradient = np.sum(dL_dpsi0)
|
|
||||||
self.lengthscale.gradient = 0.
|
|
||||||
|
|
||||||
#from psi1
|
|
||||||
denom, _, dist_sq, psi1 = self._psi1computations(Z, variational_posterior)
|
|
||||||
d_length = psi1[:,:,None] * ((dist_sq - 1.)/(self.lengthscale*denom) +1./self.lengthscale)
|
|
||||||
dpsi1_dlength = d_length * dL_dpsi1[:, :, None]
|
|
||||||
if self.ARD:
|
|
||||||
self.lengthscale.gradient += dpsi1_dlength.sum(0).sum(0)
|
|
||||||
else:
|
|
||||||
self.lengthscale.gradient += dpsi1_dlength.sum()
|
|
||||||
self.variance.gradient += np.sum(dL_dpsi1 * psi1) / self.variance
|
|
||||||
#from psi2
|
|
||||||
S = variational_posterior.variance
|
|
||||||
_, Zdist_sq, _, mudist_sq, psi2 = self._psi2computations(Z, variational_posterior)
|
|
||||||
if not self.ARD:
|
|
||||||
self.lengthscale.gradient += self._weave_psi2_lengthscale_grads(dL_dpsi2, psi2, Zdist_sq, S, mudist_sq, l2).sum()
|
|
||||||
else:
|
|
||||||
self.lengthscale.gradient += self._weave_psi2_lengthscale_grads(dL_dpsi2, psi2, Zdist_sq, S, mudist_sq, l2)
|
|
||||||
|
|
||||||
self.variance.gradient += 2.*np.sum(dL_dpsi2 * psi2)/self.variance
|
|
||||||
|
|
||||||
else:
|
|
||||||
raise ValueError, "unknown distriubtion received for psi-statistics"
|
|
||||||
|
|
||||||
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
def gradients_Z_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||||
# Spike-and-Slab GPLVM
|
return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[2]
|
||||||
if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
|
|
||||||
if self.useGPU:
|
|
||||||
return self.psicomp.gradients_Z_expectations(dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
|
|
||||||
else:
|
|
||||||
_, _, dL_dZ, _, _, _ = ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
|
|
||||||
return dL_dZ
|
|
||||||
|
|
||||||
# _, _, _, _, _, _dpsi1_dZ, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
|
|
||||||
# _, _, _, _, _, _dpsi2_dZ, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
|
|
||||||
#
|
|
||||||
# #psi1
|
|
||||||
# grad = (dL_dpsi1[:, :, None] * _dpsi1_dZ).sum(axis=0)
|
|
||||||
#
|
|
||||||
# #psi2
|
|
||||||
# grad += (dL_dpsi2[:, :, :, None] * _dpsi2_dZ).sum(axis=0).sum(axis=1)
|
|
||||||
#
|
|
||||||
# return grad
|
|
||||||
|
|
||||||
elif isinstance(variational_posterior, variational.NormalPosterior):
|
|
||||||
l2 = self.lengthscale **2
|
|
||||||
|
|
||||||
#psi1
|
|
||||||
denom, dist, dist_sq, psi1 = self._psi1computations(Z, variational_posterior)
|
|
||||||
grad = np.einsum('ij,ij,ijk,ijk->jk', dL_dpsi1, psi1, dist, -1./(denom*l2))
|
|
||||||
|
|
||||||
#psi2
|
|
||||||
Zdist, Zdist_sq, mudist, mudist_sq, psi2 = self._psi2computations(Z, variational_posterior)
|
|
||||||
term1 = Zdist / l2 # M, M, Q
|
|
||||||
S = variational_posterior.variance
|
|
||||||
term2 = mudist / (2.*S[:,None,None,:] + l2) # N, M, M, Q
|
|
||||||
|
|
||||||
grad += 2.*np.einsum('ijk,ijk,ijkl->kl', dL_dpsi2, psi2, term1[None,:,:,:] + term2)
|
|
||||||
|
|
||||||
return grad
|
|
||||||
else:
|
|
||||||
raise ValueError, "unknown distriubtion received for psi-statistics"
|
|
||||||
|
|
||||||
def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||||
# Spike-and-Slab GPLVM
|
return self.psicomp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)[3:]
|
||||||
if isinstance(variational_posterior, variational.SpikeAndSlabPosterior):
|
|
||||||
if self.useGPU:
|
|
||||||
return self.psicomp.gradients_qX_expectations(dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
|
|
||||||
else:
|
|
||||||
_, _, _, dL_dmu, dL_dS, dL_dgamma = ssrbf_psi_comp.psiDerivativecomputations(dL_dpsi0, dL_dpsi1, dL_dpsi2, self.variance, self.lengthscale, Z, variational_posterior)
|
|
||||||
return dL_dmu, dL_dS, dL_dgamma
|
|
||||||
|
|
||||||
# ndata = variational_posterior.mean.shape[0]
|
|
||||||
#
|
|
||||||
# _, _, _dpsi1_dgamma, _dpsi1_dmu, _dpsi1_dS, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
|
|
||||||
# _, _, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
|
|
||||||
#
|
|
||||||
# #psi1
|
|
||||||
# grad_mu = (dL_dpsi1[:, :, None] * _dpsi1_dmu).sum(axis=1)
|
|
||||||
# grad_S = (dL_dpsi1[:, :, None] * _dpsi1_dS).sum(axis=1)
|
|
||||||
# grad_gamma = (dL_dpsi1[:,:,None] * _dpsi1_dgamma).sum(axis=1)
|
|
||||||
#
|
|
||||||
# #psi2
|
|
||||||
# grad_mu += (dL_dpsi2[:, :, :, None] * _dpsi2_dmu).reshape(ndata,-1,self.input_dim).sum(axis=1)
|
|
||||||
# grad_S += (dL_dpsi2[:, :, :, None] * _dpsi2_dS).reshape(ndata,-1,self.input_dim).sum(axis=1)
|
|
||||||
# grad_gamma += (dL_dpsi2[:,:,:, None] * _dpsi2_dgamma).reshape(ndata,-1,self.input_dim).sum(axis=1)
|
|
||||||
#
|
|
||||||
# if self.group_spike_prob:
|
|
||||||
# grad_gamma[:] = grad_gamma.mean(axis=0)
|
|
||||||
#
|
|
||||||
# return grad_mu, grad_S, grad_gamma
|
|
||||||
|
|
||||||
elif isinstance(variational_posterior, variational.NormalPosterior):
|
|
||||||
|
|
||||||
l2 = self.lengthscale **2
|
|
||||||
#psi1
|
|
||||||
denom, dist, dist_sq, psi1 = self._psi1computations(Z, variational_posterior)
|
|
||||||
tmp = psi1[:, :, None] / l2 / denom
|
|
||||||
grad_mu = np.sum(dL_dpsi1[:, :, None] * tmp * dist, 1)
|
|
||||||
grad_S = np.sum(dL_dpsi1[:, :, None] * 0.5 * tmp * (dist_sq - 1), 1)
|
|
||||||
#psi2
|
|
||||||
_, _, mudist, mudist_sq, psi2 = self._psi2computations(Z, variational_posterior)
|
|
||||||
S = variational_posterior.variance
|
|
||||||
tmp = psi2[:, :, :, None] / (2.*S[:,None,None,:] + l2)
|
|
||||||
grad_mu += -2.*np.einsum('ijk,ijkl,ijkl->il', dL_dpsi2, tmp , mudist)
|
|
||||||
grad_S += np.einsum('ijk,ijkl,ijkl->il', dL_dpsi2 , tmp , (2.*mudist_sq - 1))
|
|
||||||
|
|
||||||
else:
|
|
||||||
raise ValueError, "unknown distriubtion received for psi-statistics"
|
|
||||||
|
|
||||||
return grad_mu, grad_S
|
|
||||||
|
|
||||||
#---------------------------------------#
|
|
||||||
# Precomputations #
|
|
||||||
#---------------------------------------#
|
|
||||||
|
|
||||||
@Cache_this(limit=1)
|
|
||||||
def _psi1computations(self, Z, vp):
|
|
||||||
mu, S = vp.mean, vp.variance
|
|
||||||
l2 = self.lengthscale **2
|
|
||||||
denom = S[:, None, :] / l2 + 1. # N,1,Q
|
|
||||||
dist = Z[None, :, :] - mu[:, None, :] # N,M,Q
|
|
||||||
dist_sq = np.square(dist) / l2 / denom # N,M,Q
|
|
||||||
exponent = -0.5 * np.sum(dist_sq + np.log(denom), -1)#N,M
|
|
||||||
psi1 = self.variance * np.exp(exponent) # N,M
|
|
||||||
return denom, dist, dist_sq, psi1
|
|
||||||
|
|
||||||
|
|
||||||
@Cache_this(limit=1, ignore_args=(0,))
|
|
||||||
def _Z_distances(self, Z):
|
|
||||||
Zhat = 0.5 * (Z[:, None, :] + Z[None, :, :]) # M,M,Q
|
|
||||||
Zdist = 0.5 * (Z[:, None, :] - Z[None, :, :]) # M,M,Q
|
|
||||||
return Zhat, Zdist
|
|
||||||
|
|
||||||
@Cache_this(limit=1)
|
|
||||||
def _psi2computations(self, Z, vp):
|
|
||||||
|
|
||||||
if config.getboolean('parallel', 'openmp'):
|
|
||||||
pragma_string = '#pragma omp parallel for private(tmp, exponent_tmp)'
|
|
||||||
header_string = '#include <omp.h>'
|
|
||||||
libraries = ['gomp']
|
|
||||||
else:
|
|
||||||
pragma_string = ''
|
|
||||||
header_string = ''
|
|
||||||
libraries = []
|
|
||||||
|
|
||||||
mu, S = vp.mean, vp.variance
|
|
||||||
|
|
||||||
N, Q = mu.shape
|
|
||||||
M = Z.shape[0]
|
|
||||||
|
|
||||||
#compute required distances
|
|
||||||
Zhat, Zdist = self._Z_distances(Z)
|
|
||||||
Zdist_sq = np.square(Zdist / self.lengthscale) # M,M,Q
|
|
||||||
|
|
||||||
#allocate memory for the things we want to compute
|
|
||||||
mudist = np.empty((N, M, M, Q))
|
|
||||||
mudist_sq = np.empty((N, M, M, Q))
|
|
||||||
psi2 = np.empty((N, M, M))
|
|
||||||
|
|
||||||
l2 = self.lengthscale **2
|
|
||||||
denom = (2.*S[:,None,None,:] / l2) + 1. # N,Q
|
|
||||||
half_log_denom = 0.5 * np.log(denom[:,0,0,:])
|
|
||||||
denom_l2 = denom[:,0,0,:]*l2
|
|
||||||
|
|
||||||
variance_sq = float(np.square(self.variance))
|
|
||||||
code = """
|
|
||||||
double tmp, exponent_tmp;
|
|
||||||
%s
|
|
||||||
for (int n=0; n<N; n++)
|
|
||||||
{
|
|
||||||
for (int m=0; m<M; m++)
|
|
||||||
{
|
|
||||||
for (int mm=0; mm<(m+1); mm++)
|
|
||||||
{
|
|
||||||
exponent_tmp = 0.0;
|
|
||||||
for (int q=0; q<Q; q++)
|
|
||||||
{
|
|
||||||
//compute mudist
|
|
||||||
tmp = mu(n,q) - Zhat(m,mm,q);
|
|
||||||
mudist(n,m,mm,q) = tmp;
|
|
||||||
mudist(n,mm,m,q) = tmp;
|
|
||||||
|
|
||||||
//now mudist_sq
|
|
||||||
tmp = tmp*tmp/denom_l2(n,q);
|
|
||||||
mudist_sq(n,m,mm,q) = tmp;
|
|
||||||
mudist_sq(n,mm,m,q) = tmp;
|
|
||||||
|
|
||||||
//now exponent
|
|
||||||
tmp = -Zdist_sq(m,mm,q) - tmp - half_log_denom(n,q);
|
|
||||||
exponent_tmp += tmp;
|
|
||||||
}
|
|
||||||
//compute psi2 by exponentiating
|
|
||||||
psi2(n,m,mm) = variance_sq * exp(exponent_tmp);
|
|
||||||
psi2(n,mm,m) = psi2(n,m,mm);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
""" % pragma_string
|
|
||||||
|
|
||||||
support_code = """
|
|
||||||
%s
|
|
||||||
#include <math.h>
|
|
||||||
""" % header_string
|
|
||||||
mu = param_to_array(mu)
|
|
||||||
weave.inline(code, support_code=support_code, libraries=libraries,
|
|
||||||
arg_names=['N', 'M', 'Q', 'mu', 'Zhat', 'mudist_sq', 'mudist', 'denom_l2', 'Zdist_sq', 'half_log_denom', 'psi2', 'variance_sq'],
|
|
||||||
type_converters=weave.converters.blitz, **self.weave_options)
|
|
||||||
|
|
||||||
return Zdist, Zdist_sq, mudist, mudist_sq, psi2
|
|
||||||
|
|
||||||
def _weave_psi2_lengthscale_grads(self, dL_dpsi2, psi2, Zdist_sq, S, mudist_sq, l2):
|
|
||||||
|
|
||||||
#here's the einsum equivalent, it's ~3 times slower
|
|
||||||
#return 2.*np.einsum( 'ijk,ijk,ijkl,il->l', dL_dpsi2, psi2, Zdist_sq * (2.*S[:,None,None,:]/l2 + 1.) + mudist_sq + S[:, None, None, :] / l2, 1./(2.*S + l2))*self.lengthscale
|
|
||||||
|
|
||||||
result = np.zeros(self.input_dim)
|
|
||||||
if config.getboolean('parallel', 'openmp'):
|
|
||||||
pragma_string = '#pragma omp parallel for reduction(+:tmp)'
|
|
||||||
header_string = '#include <omp.h>'
|
|
||||||
libraries = ['gomp']
|
|
||||||
else:
|
|
||||||
pragma_string = ''
|
|
||||||
header_string = ''
|
|
||||||
libraries = []
|
|
||||||
code = """
|
|
||||||
double tmp;
|
|
||||||
for(int q=0; q<Q; q++)
|
|
||||||
{
|
|
||||||
tmp = 0.0;
|
|
||||||
%s
|
|
||||||
for(int n=0; n<N; n++)
|
|
||||||
{
|
|
||||||
for(int m=0; m<M; m++)
|
|
||||||
{
|
|
||||||
//diag terms
|
|
||||||
tmp += dL_dpsi2(n,m,m) * psi2(n,m,m) * (Zdist_sq(m,m,q) * (2.0*S(n,q)/l2(q) + 1.0) + mudist_sq(n,m,m,q) + S(n,q)/l2(q)) / (2.0*S(n,q) + l2(q)) ;
|
|
||||||
|
|
||||||
//off-diag terms
|
|
||||||
for(int mm=0; mm<m; mm++)
|
|
||||||
{
|
|
||||||
tmp += 2.0 * dL_dpsi2(n,m,mm) * psi2(n,m,mm) * (Zdist_sq(m,mm,q) * (2.0*S(n,q)/l2(q) + 1.0) + mudist_sq(n,m,mm,q) + S(n,q)/l2(q)) / (2.0*S(n,q) + l2(q)) ;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
result(q) = tmp;
|
|
||||||
}
|
|
||||||
|
|
||||||
""" % pragma_string
|
|
||||||
support_code = """
|
|
||||||
%s
|
|
||||||
#include <math.h>
|
|
||||||
""" % header_string
|
|
||||||
N,Q = S.shape
|
|
||||||
M = psi2.shape[-1]
|
|
||||||
|
|
||||||
S = param_to_array(S)
|
|
||||||
weave.inline(code, support_code=support_code, libraries=libraries,
|
|
||||||
arg_names=['psi2', 'dL_dpsi2', 'N', 'M', 'Q', 'mudist_sq', 'l2', 'Zdist_sq', 'S', 'result'],
|
|
||||||
type_converters=weave.converters.blitz, **self.weave_options)
|
|
||||||
|
|
||||||
return 2.*result*self.lengthscale
@ -1,139 +0,0 @@
|
||||||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
|
||||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
|
||||||
|
|
||||||
|
|
||||||
from kern import Kern
|
|
||||||
import numpy as np
|
|
||||||
from ...util.linalg import tdot
|
|
||||||
from ...util.config import *
|
|
||||||
from stationary import Stationary
|
|
||||||
from psi_comp import ssrbf_psi_comp
|
|
||||||
|
|
||||||
class SSRBF(Stationary):
|
|
||||||
"""
|
|
||||||
Radial Basis Function kernel, aka squared-exponential, exponentiated quadratic or Gaussian kernel
|
|
||||||
for Spike-and-Slab GPLVM
|
|
||||||
|
|
||||||
.. math::
|
|
||||||
|
|
||||||
k(r) = \sigma^2 \exp \\bigg(- \\frac{1}{2} r^2 \\bigg) \ \ \ \ \ \\text{ where } r^2 = \sum_{i=1}^d \\frac{ (x_i-x^\prime_i)^2}{\ell_i^2}
|
|
||||||
|
|
||||||
where \ell_i is the lengthscale, \sigma^2 the variance and d the dimensionality of the input.
|
|
||||||
|
|
||||||
:param input_dim: the number of input dimensions
|
|
||||||
:type input_dim: int
|
|
||||||
:param variance: the variance of the kernel
|
|
||||||
:type variance: float
|
|
||||||
:param lengthscale: the vector of lengthscale of the kernel
|
|
||||||
:type lengthscale: array or list of the appropriate size (or float if there is only one lengthscale parameter)
|
|
||||||
:param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one single lengthscale parameter \ell), otherwise there is one lengthscale parameter per dimension.
|
|
||||||
:type ARD: Boolean
|
|
||||||
:rtype: kernel object
|
|
||||||
|
|
||||||
.. Note: this object implements both the ARD and 'spherical' version of the function
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, input_dim, variance=1., lengthscale=None, ARD=True, active_dims=None, name='SSRBF'):
|
|
||||||
assert ARD==True, "Not Implemented!"
|
|
||||||
super(SSRBF, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name)
|
|
||||||
|
|
||||||
def K_of_r(self, r):
|
|
||||||
return self.variance * np.exp(-0.5 * r**2)
|
|
||||||
|
|
||||||
def dK_dr(self, r):
|
|
||||||
return -r*self.K_of_r(r)
|
|
||||||
|
|
||||||
def parameters_changed(self):
|
|
||||||
pass
|
|
||||||
|
|
||||||
def Kdiag(self, X):
|
|
||||||
ret = np.empty(X.shape[0])
|
|
||||||
ret[:] = self.variance
|
|
||||||
return ret
|
|
||||||
|
|
||||||
#---------------------------------------#
|
|
||||||
# PSI statistics #
|
|
||||||
#---------------------------------------#
|
|
||||||
|
|
||||||
def psi0(self, Z, variational_posterior):
|
|
||||||
ret = np.empty(variational_posterior.mean.shape[0])
|
|
||||||
ret[:] = self.variance
|
|
||||||
return ret
|
|
||||||
|
|
||||||
def psi1(self, Z, variational_posterior):
|
|
||||||
_psi1, _, _, _, _, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
|
|
||||||
return _psi1
|
|
||||||
|
|
||||||
def psi2(self, Z, variational_posterior):
|
|
||||||
_psi2, _, _, _, _, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
|
|
||||||
return _psi2
|
|
||||||
|
|
||||||
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
|
||||||
_, _dpsi1_dvariance, _, _, _, _, _dpsi1_dlengthscale = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
|
|
||||||
_, _dpsi2_dvariance, _, _, _, _, _dpsi2_dlengthscale = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
|
|
||||||
|
|
||||||
#contributions from psi0:
|
|
||||||
self.variance.gradient = np.sum(dL_dpsi0)
|
|
||||||
|
|
||||||
#from psi1
|
|
||||||
self.variance.gradient += np.sum(dL_dpsi1 * _dpsi1_dvariance)
|
|
||||||
self.lengthscale.gradient = (dL_dpsi1[:,:,None]*_dpsi1_dlengthscale).reshape(-1,self.input_dim).sum(axis=0)
|
|
||||||
|
|
||||||
|
|
||||||
#from psi2
|
|
||||||
self.variance.gradient += (dL_dpsi2 * _dpsi2_dvariance).sum()
|
|
||||||
self.lengthscale.gradient += (dL_dpsi2[:,:,:,None] * _dpsi2_dlengthscale).reshape(-1,self.input_dim).sum(axis=0)
|
|
||||||
|
|
||||||
def gradients_Z_expectations(self, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
|
||||||
_, _, _, _, _, _dpsi1_dZ, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
|
|
||||||
_, _, _, _, _, _dpsi2_dZ, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
|
|
||||||
|
|
||||||
#psi1
|
|
||||||
grad = (dL_dpsi1[:, :, None] * _dpsi1_dZ).sum(axis=0)
|
|
||||||
|
|
||||||
#psi2
|
|
||||||
grad += (dL_dpsi2[:, :, :, None] * _dpsi2_dZ).sum(axis=0).sum(axis=1)
|
|
||||||
|
|
||||||
return grad
|
|
||||||
|
|
||||||
def gradients_qX_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
|
||||||
ndata = variational_posterior.mean.shape[0]
|
|
||||||
|
|
||||||
_, _, _dpsi1_dgamma, _dpsi1_dmu, _dpsi1_dS, _, _ = ssrbf_psi_comp._psi1computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
|
|
||||||
_, _, _dpsi2_dgamma, _dpsi2_dmu, _dpsi2_dS, _, _ = ssrbf_psi_comp._psi2computations(self.variance, self.lengthscale, Z, variational_posterior.mean, variational_posterior.variance, variational_posterior.binary_prob)
|
|
||||||
|
|
||||||
#psi1
|
|
||||||
grad_mu = (dL_dpsi1[:, :, None] * _dpsi1_dmu).sum(axis=1)
|
|
||||||
grad_S = (dL_dpsi1[:, :, None] * _dpsi1_dS).sum(axis=1)
|
|
||||||
grad_gamma = (dL_dpsi1[:,:,None] * _dpsi1_dgamma).sum(axis=1)
|
|
||||||
#psi2
|
|
||||||
grad_mu += (dL_dpsi2[:, :, :, None] * _dpsi2_dmu).reshape(ndata,-1,self.input_dim).sum(axis=1)
|
|
||||||
grad_S += (dL_dpsi2[:, :, :, None] * _dpsi2_dS).reshape(ndata,-1,self.input_dim).sum(axis=1)
|
|
||||||
grad_gamma += (dL_dpsi2[:,:,:, None] * _dpsi2_dgamma).reshape(ndata,-1,self.input_dim).sum(axis=1)
|
|
||||||
|
|
||||||
return grad_mu, grad_S, grad_gamma
|
|
||||||
|
|
||||||
#---------------------------------------#
|
|
||||||
# Precomputations #
|
|
||||||
#---------------------------------------#
|
|
||||||
|
|
||||||
#@cache_this(1)
|
|
||||||
def _K_computations(self, X, X2):
|
|
||||||
"""
|
|
||||||
K(X,X2) - X is NxQ
|
|
||||||
Q -> input dimension (self.input_dim)
|
|
||||||
"""
|
|
||||||
if X2 is None:
|
|
||||||
self._X2 = None
|
|
||||||
|
|
||||||
X = X / self.lengthscale
|
|
||||||
Xsquare = np.sum(np.square(X), axis=1)
|
|
||||||
self._K_dist2 = -2.*tdot(X) + (Xsquare[:, None] + Xsquare[None, :])
|
|
||||||
else:
|
|
||||||
self._X2 = X2.copy()
|
|
||||||
|
|
||||||
X = X / self.lengthscale
|
|
||||||
X2 = X2 / self.lengthscale
|
|
||||||
self._K_dist2 = -2.*np.dot(X, X2.T) + (np.sum(np.square(X), axis=1)[:, None] + np.sum(np.square(X2), axis=1)[None, :])
|
|
||||||
self._K_dvar = np.exp(-0.5 * self._K_dist2)
|
@ -6,7 +6,6 @@ from kern import Kern
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from ...core.parameterization import Param
|
from ...core.parameterization import Param
|
||||||
from ...core.parameterization.transformations import Logexp
|
from ...core.parameterization.transformations import Logexp
|
||||||
import numpy as np
|
|
||||||
|
|
||||||
class Static(Kern):
|
class Static(Kern):
|
||||||
def __init__(self, input_dim, variance, active_dims, name):
|
def __init__(self, input_dim, variance, active_dims, name):
|
||||||
|
|
@ -39,7 +38,7 @@ class Static(Kern):
|
||||||
|
|
||||||
def psi2(self, Z, variational_posterior):
|
def psi2(self, Z, variational_posterior):
|
||||||
K = self.K(variational_posterior.mean, Z)
|
K = self.K(variational_posterior.mean, Z)
|
||||||
return K[:,:,None]*K[:,None,:] # NB. more efficient implementations on inheriting classes
|
return np.einsum('ij,ik->jk',K,K) #K[:,:,None]*K[:,None,:] # NB. more efficient implementations on inherriting classes
|
||||||
|
|
||||||
|
|
||||||
class White(Static):
|
class White(Static):
|
||||||
|
|
@ -53,7 +52,7 @@ class White(Static):
|
||||||
return np.zeros((X.shape[0], X2.shape[0]))
|
return np.zeros((X.shape[0], X2.shape[0]))
|
||||||
|
|
||||||
def psi2(self, Z, variational_posterior):
|
def psi2(self, Z, variational_posterior):
|
||||||
return np.zeros((variational_posterior.shape[0], Z.shape[0], Z.shape[0]), dtype=np.float64)
|
return np.zeros((Z.shape[0], Z.shape[0]), dtype=np.float64)
|
||||||
|
|
||||||
def update_gradients_full(self, dL_dK, X, X2=None):
|
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||||
self.variance.gradient = np.trace(dL_dK)
|
self.variance.gradient = np.trace(dL_dK)
|
||||||
|
|
@ -82,12 +81,12 @@ class Bias(Static):
|
||||||
self.variance.gradient = dL_dKdiag.sum()
|
self.variance.gradient = dL_dKdiag.sum()
|
||||||
|
|
||||||
def psi2(self, Z, variational_posterior):
|
def psi2(self, Z, variational_posterior):
|
||||||
ret = np.empty((variational_posterior.shape[0], Z.shape[0], Z.shape[0]), dtype=np.float64)
|
ret = np.empty((Z.shape[0], Z.shape[0]), dtype=np.float64)
|
||||||
ret[:] = self.variance**2
|
ret[:] = self.variance*self.variance*variational_posterior.shape[0]
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||||
self.variance.gradient = dL_dpsi0.sum() + dL_dpsi1.sum() + 2.*self.variance*dL_dpsi2.sum()
|
self.variance.gradient = dL_dpsi0.sum() + dL_dpsi1.sum() + 2.*self.variance*dL_dpsi2.sum()*variational_posterior.shape[0]
|
||||||
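Note on the hunk above: psi2 for the static kernels is now summed over the N data points (an M x M matrix) rather than returned per point (N x M x M), which is why Bias.psi2 becomes N * variance**2 and the variance gradient gains the factor variational_posterior.shape[0]. A sketch of the equivalence, assuming dL_dpsi2 is the correspondingly summed quantity:

    # old: psi2[n] = variance**2 for each of the N data points
    # new: psi2 = N * variance**2, so d(psi2)/d(variance) = 2 * variance * N
    #      and the psi2 term of the gradient is 2 * variance * dL_dpsi2.sum() * N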
|
|
||||||
class Fixed(Static):
|
class Fixed(Static):
|
||||||
def __init__(self, input_dim, covariance_matrix, variance=1., active_dims=None, name='fixed'):
|
def __init__(self, input_dim, covariance_matrix, variance=1., active_dims=None, name='fixed'):
|
||||||
|
|
@ -97,7 +96,7 @@ class Fixed(Static):
|
||||||
:param variance: the variance of the kernel
|
:param variance: the variance of the kernel
|
||||||
:type variance: float
|
:type variance: float
|
||||||
"""
|
"""
|
||||||
super(Bias, self).__init__(input_dim, variance, active_dims, name)
|
super(Fixed, self).__init__(input_dim, variance, active_dims, name)
|
||||||
self.fixed_K = covariance_matrix
|
self.fixed_K = covariance_matrix
|
||||||
def K(self, X, X2):
|
def K(self, X, X2):
|
||||||
return self.variance * self.fixed_K
|
return self.variance * self.fixed_K
|
||||||
|
|
@ -112,7 +111,7 @@ class Fixed(Static):
|
||||||
self.variance.gradient = np.einsum('i,i', dL_dKdiag, self.fixed_K)
|
self.variance.gradient = np.einsum('i,i', dL_dKdiag, self.fixed_K)
|
||||||
|
|
||||||
def psi2(self, Z, variational_posterior):
|
def psi2(self, Z, variational_posterior):
|
||||||
return np.zeros((variational_posterior.shape[0], Z.shape[0], Z.shape[0]), dtype=np.float64)
|
return np.zeros((Z.shape[0], Z.shape[0]), dtype=np.float64)
|
||||||
|
|
||||||
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
def update_gradients_expectations(self, dL_dpsi0, dL_dpsi1, dL_dpsi2, Z, variational_posterior):
|
||||||
self.variance.gradient = dL_dpsi0.sum()
|
self.variance.gradient = dL_dpsi0.sum()
|
||||||
|
|
|
||||||
|
|
@ -79,7 +79,6 @@ class Stationary(Kern):
|
||||||
#a convenience function, so we can cache dK_dr
|
#a convenience function, so we can cache dK_dr
|
||||||
return self.dK_dr(self._scaled_dist(X, X2))
|
return self.dK_dr(self._scaled_dist(X, X2))
|
||||||
|
|
||||||
@Cache_this(limit=5, ignore_args=(0,))
|
|
||||||
def _unscaled_dist(self, X, X2=None):
|
def _unscaled_dist(self, X, X2=None):
|
||||||
"""
|
"""
|
||||||
Compute the Euclidean distance between each row of X and X2, or between
|
Compute the Euclidean distance between each row of X and X2, or between
|
||||||
|
|
@ -169,11 +168,18 @@ class Stationary(Kern):
|
||||||
|
|
||||||
#the lower memory way with a loop
|
#the lower memory way with a loop
|
||||||
tmp = invdist*dL_dr
|
tmp = invdist*dL_dr
|
||||||
if X2 is None:
|
|
||||||
tmp *= 2.
|
|
||||||
X2 = X
|
|
||||||
ret = np.empty(X.shape, dtype=np.float64)
|
ret = np.empty(X.shape, dtype=np.float64)
|
||||||
|
<<<<<<< HEAD
|
||||||
[np.sum(tmp*(X[:,q:q+1]-X2[:,q:q+1]), axis=1, out=ret[:,q]) for q in xrange(self.input_dim)]
|
[np.sum(tmp*(X[:,q:q+1]-X2[:,q:q+1]), axis=1, out=ret[:,q]) for q in xrange(self.input_dim)]
|
||||||
|
=======
|
||||||
|
if X2 is None:
|
||||||
|
[np.einsum('ij,ij->i', tmp, X[:,q][:,None]-X[:,q][None,:], out=ret[:,q]) for q in xrange(self.input_dim)]
|
||||||
|
ret2 = np.empty(X.shape, dtype=np.float64)
|
||||||
|
[np.einsum('ij,ji->j', tmp, X[:,q][:,None]-X[:,q][None,:], out=ret2[:,q]) for q in xrange(self.input_dim)]
|
||||||
|
ret += ret2
|
||||||
|
else:
|
||||||
|
[np.einsum('ij,ij->i', tmp, X[:,q][:,None]-X2[:,q][None,:], out=ret[:,q]) for q in xrange(self.input_dim)]
|
||||||
|
>>>>>>> 1061bf52482aa3bf6769db810c955d5fbf51ceae
|
||||||
ret /= self.lengthscale**2
|
ret /= self.lengthscale**2
|
||||||
return ret
|
return ret
|
||||||
|
|
||||||
|
|
|
||||||
208
GPy/kern/_src/trunclinear.py
Normal file
|
|
@ -0,0 +1,208 @@
|
||||||
|
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||||
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
from scipy import weave
|
||||||
|
from kern import Kern
|
||||||
|
from ...util.linalg import tdot
|
||||||
|
from ...util.misc import param_to_array
|
||||||
|
from ...core.parameterization import Param
|
||||||
|
from ...core.parameterization.transformations import Logexp
|
||||||
|
from ...util.caching import Cache_this
|
||||||
|
from ...core.parameterization import variational
|
||||||
|
from ...util.config import *
|
||||||
|
|
||||||
|
class TruncLinear(Kern):
|
||||||
|
"""
|
||||||
|
Truncated Linear kernel
|
||||||
|
|
||||||
|
.. math::
|
||||||
|
|
||||||
|
k(x,y) = \sum_{i=1}^{input\_dim} \sigma^2_i \max(0, x_i y_i - \sigma_q)
|
||||||
|
|
||||||
|
:param input_dim: the number of input dimensions
|
||||||
|
:type input_dim: int
|
||||||
|
:param variances: the vector of variances :math:`\sigma^2_i`
|
||||||
|
:type variances: array or list of the appropriate size (or float if there
|
||||||
|
is only one variance parameter)
|
||||||
|
:param ARD: Auto Relevance Determination. If False, the kernel has only one
|
||||||
|
variance parameter \sigma^2, otherwise there is one variance
|
||||||
|
parameter per dimension.
|
||||||
|
:type ARD: Boolean
|
||||||
|
:rtype: kernel object
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, input_dim, variances=None, delta=None, ARD=False, active_dims=None, name='linear'):
|
||||||
|
super(TruncLinear, self).__init__(input_dim, active_dims, name)
|
||||||
|
self.ARD = ARD
|
||||||
|
if not ARD:
|
||||||
|
if variances is not None:
|
||||||
|
variances = np.asarray(variances)
|
||||||
|
delta = np.asarray(delta)
|
||||||
|
assert variances.size == 1, "Only one variance needed for non-ARD kernel"
|
||||||
|
else:
|
||||||
|
variances = np.ones(1)
|
||||||
|
delta = np.zeros(1)
|
||||||
|
else:
|
||||||
|
if variances is not None:
|
||||||
|
variances = np.asarray(variances)
|
||||||
|
delta = np.asarray(delta)
|
||||||
|
assert variances.size == self.input_dim, "bad number of variances, need one ARD variance per input_dim"
|
||||||
|
else:
|
||||||
|
variances = np.ones(self.input_dim)
|
||||||
|
delta = np.zeros(self.input_dim)
|
||||||
|
|
||||||
|
self.variances = Param('variances', variances, Logexp())
|
||||||
|
self.delta = Param('delta', delta)
|
||||||
|
self.add_parameter(self.variances)
|
||||||
|
self.add_parameter(self.delta)
|
||||||
|
|
||||||
|
@Cache_this(limit=2)
|
||||||
|
def K(self, X, X2=None):
|
||||||
|
XX = self.variances*self._product(X, X2)
|
||||||
|
return XX.sum(axis=-1)
|
||||||
|
|
||||||
|
@Cache_this(limit=2)
|
||||||
|
def _product(self, X, X2=None):
|
||||||
|
if X2 is None:
|
||||||
|
X2 = X
|
||||||
|
XX = np.einsum('nq,mq->nmq',X-self.delta,X2-self.delta)
|
||||||
|
XX[XX<0] = 0
|
||||||
|
return XX
|
||||||
|
|
||||||
|
def Kdiag(self, X):
|
||||||
|
return (self.variances*np.square(X-self.delta)).sum(axis=-1)
|
||||||
|
|
||||||
|
def update_gradients_full(self, dL_dK, X, X2=None):
|
||||||
|
dK_dvar = self._product(X, X2)
|
||||||
|
if X2 is None:
|
||||||
|
X2=X
|
||||||
|
dK_ddelta = self.variances*(2*self.delta-X[:,None,:]-X2[None,:,:])*(dK_dvar>0)
|
||||||
|
if self.ARD:
|
||||||
|
self.variances.gradient[:] = np.einsum('nmq,nm->q',dK_dvar,dL_dK)
|
||||||
|
self.delta.gradient[:] = np.einsum('nmq,nm->q',dK_ddelta,dL_dK)
|
||||||
|
else:
|
||||||
|
self.variances.gradient[:] = np.einsum('nmq,nm->',dK_dvar,dL_dK)
|
||||||
|
self.delta.gradient[:] = np.einsum('nmq,nm->',dK_ddelta,dL_dK)
|
||||||
|
|
||||||
|
def update_gradients_diag(self, dL_dKdiag, X):
|
||||||
|
if self.ARD:
|
||||||
|
self.variances.gradient[:] = np.einsum('nq,n->q',np.square(X-self.delta),dL_dKdiag)
|
||||||
|
self.delta.gradient[:] = np.einsum('nq,n->q',2*self.variances*(self.delta-X),dL_dKdiag)
|
||||||
|
else:
|
||||||
|
self.variances.gradient[:] = np.einsum('nq,n->',np.square(X-self.delta),dL_dKdiag)
|
||||||
|
self.delta.gradient[:] = np.einsum('nq,n->',2*self.variances*(self.delta-X),dL_dKdiag)
|
||||||
|
|
||||||
|
def gradients_X(self, dL_dK, X, X2=None):
|
||||||
|
XX = self._product(X, X2)
|
||||||
|
if X2 is None:
|
||||||
|
Xp = (self.variances*(X-self.delta))*(XX>0)
|
||||||
|
else:
|
||||||
|
Xp = (self.variances*(X2-self.delta))*(XX>0)
|
||||||
|
if X2 is None:
|
||||||
|
return np.einsum('nmq,nm->nq',Xp,dL_dK)+np.einsum('mnq,nm->mq',Xp,dL_dK)
|
||||||
|
else:
|
||||||
|
return np.einsum('nmq,nm->nq',Xp,dL_dK)
|
||||||
|
|
||||||
|
def gradients_X_diag(self, dL_dKdiag, X):
|
||||||
|
return 2.*self.variances*dL_dKdiag[:,None]*(X-self.delta)
|
||||||
|
|
||||||
|
def input_sensitivity(self):
|
||||||
|
return np.ones(self.input_dim) * self.variances
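As a quick standalone sanity check of the class above, a minimal NumPy sketch mirroring K/_product (an illustration, not part of the file); the toy shapes are assumptions:

    import numpy as np

    def trunclinear_K(X, X2, variances, delta):
        # pairwise products of the delta-shifted inputs, clipped at zero per dimension
        XX = np.einsum('nq,mq->nmq', X - delta, X2 - delta)
        XX[XX < 0] = 0
        return (variances * XX).sum(axis=-1)

    X, X2 = np.random.randn(5, 2), np.random.randn(3, 2)
    K = trunclinear_K(X, X2, variances=np.ones(2), delta=np.zeros(2))   # shape (5, 3)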
|
||||||
|
|
||||||
|
class TruncLinear_inf(Kern):
|
||||||
|
"""
|
||||||
|
Truncated Linear kernel
|
||||||
|
|
||||||
|
.. math::
|
||||||
|
|
||||||
|
k(x,y) = \sum_{i=1}^{input\_dim} \sigma^2_i \max(0, x_i y_i - \sigma_q)
|
||||||
|
|
||||||
|
:param input_dim: the number of input dimensions
|
||||||
|
:type input_dim: int
|
||||||
|
:param variances: the vector of variances :math:`\sigma^2_i`
|
||||||
|
:type variances: array or list of the appropriate size (or float if there
|
||||||
|
is only one variance parameter)
|
||||||
|
:param ARD: Auto Relevance Determination. If False, the kernel has only one
|
||||||
|
variance parameter \sigma^2, otherwise there is one variance
|
||||||
|
parameter per dimension.
|
||||||
|
:type ARD: Boolean
|
||||||
|
:rtype: kernel object
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
    def __init__(self, input_dim, interval, variances=None, ARD=False, active_dims=None, name='linear'):
        super(TruncLinear_inf, self).__init__(input_dim, active_dims, name)
        self.interval = interval
        self.ARD = ARD
        if not ARD:
            if variances is not None:
                variances = np.asarray(variances)
                assert variances.size == 1, "Only one variance needed for non-ARD kernel"
            else:
                variances = np.ones(1)
        else:
            if variances is not None:
                variances = np.asarray(variances)
                assert variances.size == self.input_dim, "bad number of variances, need one ARD variance per input_dim"
            else:
                variances = np.ones(self.input_dim)

        self.variances = Param('variances', variances, Logexp())
        self.add_parameter(self.variances)
    # @Cache_this(limit=2)
    def K(self, X, X2=None):
        tmp = self._product(X, X2)
        return (self.variances*tmp).sum(axis=-1)

    # @Cache_this(limit=2)
    def _product(self, X, X2=None):
        if X2 is None:
            X2 = X
        X_X2 = X[:,None,:]-X2[None,:,:]
        tmp = np.abs(X_X2**3)/6+np.einsum('nq,mq->nmq',X,X2)*(self.interval[1]-self.interval[0]) \
            -(X[:,None,:]+X2[None,:,:])*(self.interval[1]*self.interval[1]-self.interval[0]*self.interval[0])/2+(self.interval[1]**3-self.interval[0]**3)/3.
        return tmp

    def Kdiag(self, X):
        tmp = np.square(X)*(self.interval[1]-self.interval[0]) \
            -X*(self.interval[1]*self.interval[1]-self.interval[0]*self.interval[0])+(self.interval[1]**3-self.interval[0]**3)/3
        return (self.variances*tmp).sum(axis=-1)

    def update_gradients_full(self, dL_dK, X, X2=None):
        dK_dvar = self._product(X, X2)
        if self.ARD:
            self.variances.gradient[:] = np.einsum('nmq,nm->q',dK_dvar,dL_dK)
        else:
            self.variances.gradient[:] = np.einsum('nmq,nm->',dK_dvar,dL_dK)

    def update_gradients_diag(self, dL_dKdiag, X):
        tmp = np.square(X)*(self.interval[1]-self.interval[0]) \
            -X*(self.interval[1]*self.interval[1]-self.interval[0]*self.interval[0])+(self.interval[1]**3-self.interval[0]**3)/3
        if self.ARD:
            self.variances.gradient[:] = np.einsum('nq,n->q',tmp,dL_dKdiag)
        else:
            self.variances.gradient[:] = np.einsum('nq,n->',tmp,dL_dKdiag)

    def gradients_X(self, dL_dK, X, X2=None):
        XX = self._product(X, X2)
        if X2 is None:
            Xp = (self.variances*(X-self.delta))*(XX>0)
        else:
            Xp = (self.variances*(X2-self.delta))*(XX>0)
        if X2 is None:
            return np.einsum('nmq,nm->nq',Xp,dL_dK)+np.einsum('mnq,nm->mq',Xp,dL_dK)
        else:
            return np.einsum('nmq,nm->nq',Xp,dL_dK)

    def gradients_X_diag(self, dL_dKdiag, X):
        return 2.*self.variances*dL_dKdiag[:,None]*(X-self.delta)

    def input_sensitivity(self):
        return np.ones(self.input_dim) * self.variances
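A minimal usage sketch of the first kernel in this file. The import path and the class name TruncLinear are assumptions inferred from the file layout, and the constructor defaults may differ; the point is only that K and Kdiag agree on the diagonal by construction.

    import numpy as np
    from GPy.kern._src.trunclinear import TruncLinear   # hypothetical import path

    X = np.random.rand(10, 2)
    k = TruncLinear(2, ARD=True)      # variances default to ones, delta to zeros
    K = k.K(X)                        # 10 x 10, sum_q var_q * max(0, (x_nq-d_q)(x_mq-d_q))
    assert np.allclose(np.diag(K), k.Kdiag(X))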
@@ -17,3 +17,4 @@ from ss_gplvm import SSGPLVM
 from gp_coregionalized_regression import GPCoregionalizedRegression
 from sparse_gp_coregionalized_regression import SparseGPCoregionalizedRegression
 from gp_heteroscedastic_regression import GPHeteroscedasticRegression
+from ss_mrd import SSMRD
@@ -8,7 +8,7 @@ from ..likelihoods import Gaussian
 from ..inference.optimization import SCG
 from ..util import linalg
 from ..core.parameterization.variational import NormalPosterior, NormalPrior, VariationalPosterior
-from ..inference.latent_function_inference.var_dtc_parallel import update_gradients
+from ..inference.latent_function_inference.var_dtc_parallel import update_gradients, VarDTC_minibatch
 from ..inference.latent_function_inference.var_dtc_gpu import VarDTC_GPU
 import logging
@@ -25,7 +25,9 @@ class BayesianGPLVM(SparseGP):
     """
     def __init__(self, Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
-                 Z=None, kernel=None, inference_method=None, likelihood=None, name='bayesian gplvm', **kwargs):
+                 Z=None, kernel=None, inference_method=None, likelihood=None, name='bayesian gplvm', mpi_comm=None, **kwargs):
+        self.mpi_comm = mpi_comm
+        self.__IN_OPTIMIZATION__ = False
         self.logger = logging.getLogger(self.__class__.__name__)
         if X == None:
             from ..util.initialization import initialize_latent
@@ -61,22 +63,42 @@ class BayesianGPLVM(SparseGP):
                 from ..inference.latent_function_inference.var_dtc import VarDTCMissingData
                 self.logger.debug("creating inference_method with var_dtc missing data")
                 inference_method = VarDTCMissingData(inan=inan)
+            elif mpi_comm is not None:
+                inference_method = VarDTC_minibatch(mpi_comm=mpi_comm)
             else:
                 from ..inference.latent_function_inference.var_dtc import VarDTC
                 self.logger.debug("creating inference_method var_dtc")
                 inference_method = VarDTC()
+        if isinstance(inference_method, VarDTC_minibatch):
+            inference_method.mpi_comm = mpi_comm
+
+        if kernel.useGPU and isinstance(inference_method, VarDTC_GPU):
+            kernel.psicomp.GPU_direct = True
+
         SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, name, **kwargs)
         self.logger.info("Adding X as parameter")
         self.add_parameter(self.X, index=0)
+
+        if mpi_comm != None:
+            from ..util.mpi import divide_data
+            N_start, N_end, N_list = divide_data(Y.shape[0], mpi_comm)
+            self.N_range = (N_start, N_end)
+            self.N_list = np.array(N_list)
+            self.Y_local = self.Y[N_start:N_end]
+            print 'MPI RANK: '+str(self.mpi_comm.rank)+' with datasize: '+str(self.N_range)
+            mpi_comm.Bcast(self.param_array, root=0)
+
     def set_X_gradients(self, X, X_grad):
         """Set the gradients of the posterior distribution of X in its specific form."""
         X.mean.gradient, X.variance.gradient = X_grad
+
+    def get_X_gradients(self, X):
+        """Get the gradients of the posterior distribution of X in its specific form."""
+        return X.mean.gradient, X.variance.gradient
+
     def parameters_changed(self):
-        if isinstance(self.inference_method, VarDTC_GPU):
-            update_gradients(self)
+        if isinstance(self.inference_method, VarDTC_GPU) or isinstance(self.inference_method, VarDTC_minibatch):
+            update_gradients(self, mpi_comm=self.mpi_comm)
             return

         super(BayesianGPLVM, self).parameters_changed()
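With the new mpi_comm argument, every rank builds the full model but keeps only its own slice of the data in Y_local. A hypothetical launch sketch, assuming mpi4py is available and that every rank constructs the model on identical data (the file name and dimensions are illustrative only):

    # run with e.g.:  mpiexec -n 4 python fit_bgplvm_mpi.py
    from mpi4py import MPI
    import numpy as np
    import GPy

    comm = MPI.COMM_WORLD
    Y = np.load('Y.npy')    # hypothetical data file, identical on every rank
    m = GPy.models.BayesianGPLVM(Y, input_dim=5, num_inducing=30, mpi_comm=comm)
    m.optimize()            # rank 0 drives the optimiser, the other ranks follow the broadcasts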
@@ -188,6 +210,50 @@ class BayesianGPLVM(SparseGP):
         from ..plotting.matplot_dep import dim_reduction_plots

         return dim_reduction_plots.plot_steepest_gradient_map(self,*args,**kwargs)
+
+    def __getstate__(self):
+        dc = super(BayesianGPLVM, self).__getstate__()
+        dc['mpi_comm'] = None
+        if self.mpi_comm != None:
+            del dc['N_range']
+            del dc['N_list']
+            del dc['Y_local']
+        return dc
+
+    def __setstate__(self, state):
+        return super(BayesianGPLVM, self).__setstate__(state)
+
+    #=====================================================
+    # The MPI parallelization
+    # - can move to model at some point
+    #=====================================================
+
+    def _set_params_transformed(self, p):
+        if self.mpi_comm != None:
+            if self.__IN_OPTIMIZATION__ and self.mpi_comm.rank==0:
+                self.mpi_comm.Bcast(np.int32(1), root=0)
+            self.mpi_comm.Bcast(p, root=0)
+        super(BayesianGPLVM, self)._set_params_transformed(p)
+
+    def optimize(self, optimizer=None, start=None, **kwargs):
+        self.__IN_OPTIMIZATION__ = True
+        if self.mpi_comm==None:
+            super(BayesianGPLVM, self).optimize(optimizer, start, **kwargs)
+        elif self.mpi_comm.rank==0:
+            super(BayesianGPLVM, self).optimize(optimizer, start, **kwargs)
+            self.mpi_comm.Bcast(np.int32(-1), root=0)
+        elif self.mpi_comm.rank>0:
+            x = self._get_params_transformed().copy()
+            flag = np.empty(1, dtype=np.int32)
+            while True:
+                self.mpi_comm.Bcast(flag, root=0)
+                if flag==1:
+                    self._set_params_transformed(x)
+                elif flag==-1:
+                    break
+                else:
+                    self.__IN_OPTIMIZATION__ = False
+                    raise Exception("Unrecognizable flag for synchronization!")
+        self.__IN_OPTIMIZATION__ = False
+
+
 def latent_cost_and_grad(mu_S, input_dim, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
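The optimize override implements a simple flag-based synchronisation: rank 0 runs the optimiser and, via _set_params_transformed, broadcasts flag 1 followed by the parameter vector on every update, then flag -1 when finished, while every other rank sits in the while loop mirroring those updates. A minimal standalone sketch of the same protocol, assuming only mpi4py; names and the number of iterations are illustrative:

    import numpy as np
    from mpi4py import MPI

    comm = MPI.COMM_WORLD
    p = np.zeros(3)
    if comm.rank == 0:
        for step in range(5):                  # stand-in for the optimiser's parameter updates
            p[:] = step
            comm.Bcast(np.int32(1), root=0)    # "new parameters follow"
            comm.Bcast(p, root=0)
        comm.Bcast(np.int32(-1), root=0)       # "optimisation finished"
    else:
        flag = np.empty(1, dtype=np.int32)
        while True:
            comm.Bcast(flag, root=0)
            if flag == 1:
                comm.Bcast(p, root=0)          # receive parameters, recompute local likelihood terms
            elif flag == -1:
                break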
@@ -11,7 +11,7 @@ from GPy.models.gplvm import GPLVM
 # from ..core import model
 # from ..util.linalg import pdinv, PCA

-class SparseGPLVM(SparseGPRegression, GPLVM):
+class SparseGPLVM(SparseGPRegression):
     """
     Sparse Gaussian Process Latent Variable Model
@@ -23,40 +23,25 @@ class SparseGPLVM(SparseGPRegression, GPLVM):
     :type init: 'PCA'|'random'
     """
-    def __init__(self, Y, input_dim, kernel=None, init='PCA', num_inducing=10):
-        X = self.initialise_latent(init, input_dim, Y)
+    def __init__(self, Y, input_dim, X=None, kernel=None, init='PCA', num_inducing=10):
+        if X is None:
+            from ..util.initialization import initialize_latent
+            X, fracs = initialize_latent(init, input_dim, Y)
         SparseGPRegression.__init__(self, X, Y, kernel=kernel, num_inducing=num_inducing)
-        self.ensure_default_constraints()
-
-    def _get_param_names(self):
-        return (sum([['X_%i_%i' % (n, q) for q in range(self.input_dim)] for n in range(self.num_data)], [])
-                + SparseGPRegression._get_param_names(self))
-
-    def _get_params(self):
-        return np.hstack((self.X.flatten(), SparseGPRegression._get_params(self)))
-
-    def _set_params(self, x):
-        self.X = x[:self.X.size].reshape(self.num_data, self.input_dim).copy()
-        SparseGPRegression._set_params(self, x[self.X.size:])
-
-    def log_likelihood(self):
-        return SparseGPRegression.log_likelihood(self)
-
-    def dL_dX(self):
-        dL_dX = self.kern.dKdiag_dX(self.dL_dpsi0, self.X)
-        dL_dX += self.kern.gradients_X(self.dL_dpsi1, self.X, self.Z)
-        return dL_dX
-
-    def _log_likelihood_gradients(self):
-        return np.hstack((self.dL_dX().flatten(), SparseGPRegression._log_likelihood_gradients(self)))
-
-    def plot(self):
-        GPLVM.plot(self)
-        # passing Z without a small amount of jitter will induce the white kernel where we don't want it!
-        mu, var, upper, lower = SparseGPRegression.predict(self, self.Z + np.random.randn(*self.Z.shape) * 0.0001)
-        pb.plot(mu[:, 0] , mu[:, 1], 'ko')
-
-    def plot_latent(self, *args, **kwargs):
-        input_1, input_2 = GPLVM.plot_latent(*args, **kwargs)
-        pb.plot(m.Z[:, input_1], m.Z[:, input_2], '^w')
+
+    def parameters_changed(self):
+        super(SparseGPLVM, self).parameters_changed()
+        self.X.gradient = self.kern.gradients_X(self.grad_dict['dL_dKnm'], self.X, self.Z)
+
+    def plot_latent(self, labels=None, which_indices=None,
+                    resolution=50, ax=None, marker='o', s=40,
+                    fignum=None, plot_inducing=True, legend=True,
+                    plot_limits=None,
+                    aspect='auto', updates=False, predict_kwargs={}, imshow_kwargs={}):
+        assert "matplotlib" in sys.modules, "matplotlib package has not been imported."
+        from ..plotting.matplot_dep import dim_reduction_plots
+
+        return dim_reduction_plots.plot_latent(self, labels, which_indices,
+                                               resolution, ax, marker, s,
+                                               fignum, plot_inducing, legend,
+                                               plot_limits, aspect, updates, predict_kwargs, imshow_kwargs)
@@ -2,18 +2,14 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
-import itertools
-from matplotlib import pyplot

 from ..core.sparse_gp import SparseGP
 from .. import kern
 from ..likelihoods import Gaussian
-from ..inference.optimization import SCG
-from ..util import linalg
 from ..core.parameterization.variational import SpikeAndSlabPrior, SpikeAndSlabPosterior
 from ..inference.latent_function_inference.var_dtc_parallel import update_gradients, VarDTC_minibatch
 from ..inference.latent_function_inference.var_dtc_gpu import VarDTC_GPU
+from ..kern._src.psi_comp.ssrbf_psi_gpucomp import PSICOMP_SSRBF_GPU

 class SSGPLVM(SparseGP):
     """
@@ -28,8 +24,12 @@ class SSGPLVM(SparseGP):

     """
     def __init__(self, Y, input_dim, X=None, X_variance=None, init='PCA', num_inducing=10,
-                 Z=None, kernel=None, inference_method=None, likelihood=None, name='Spike-and-Slab GPLVM', group_spike=False, **kwargs):
+                 Z=None, kernel=None, inference_method=None, likelihood=None, name='Spike_and_Slab GPLVM', group_spike=False, mpi_comm=None, **kwargs):
+
+        self.mpi_comm = mpi_comm
+        self.__IN_OPTIMIZATION__ = False
+        self.group_spike = group_spike
+
         if X == None:
             from ..util.initialization import initialize_latent
             X, fracs = initialize_latent(init, input_dim, Y)
@@ -41,46 +41,60 @@ class SSGPLVM(SparseGP):
         if X_variance is None: # The variance of the variational approximation (S)
             X_variance = np.random.uniform(0,.1,X.shape)

-        gamma = np.empty_like(X, order='F') # The posterior probabilities of the binary variable in the variational approximation
-        gamma[:] = 0.5 + 0.01 * np.random.randn(X.shape[0], input_dim)
-        if group_spike:
-            gamma[:] = gamma.mean(axis=0)
+        gamma = np.empty_like(X) # The posterior probabilities of the binary variable in the variational approximation
+        gamma[:] = 0.5 + 0.1 * np.random.randn(X.shape[0], input_dim)
+        gamma[gamma>1.-1e-9] = 1.-1e-9
+        gamma[gamma<1e-9] = 1e-9

         if Z is None:
             Z = np.random.permutation(X.copy())[:num_inducing]
         assert Z.shape[1] == X.shape[1]

+        pi = np.empty((input_dim))
+        pi[:] = 0.5
+
         if likelihood is None:
             likelihood = Gaussian()

         if kernel is None:
             kernel = kern.RBF(input_dim, lengthscale=fracs, ARD=True) # + kern.white(input_dim)
+        if kernel.useGPU:
+            kernel.psicomp = PSICOMP_SSRBF_GPU()

-        pi = np.empty((input_dim))
-        pi[:] = 0.5
-        self.variational_prior = SpikeAndSlabPrior(pi=pi) # the prior probability of the latent binary variable b
+        if inference_method is None:
+            inference_method = VarDTC_minibatch(mpi_comm=mpi_comm)
+
+        self.variational_prior = SpikeAndSlabPrior(pi=pi,learnPi=True) # the prior probability of the latent binary variable b

-        X = np.asfortranarray(X)
-        X_variance = np.asfortranarray(X_variance)
-        gamma = np.asfortranarray(gamma)
         X = SpikeAndSlabPosterior(X, X_variance, gamma)

-        if group_spike:
-            kernel.group_spike_prob = True
-            self.variational_prior.group_spike_prob = True
-
         SparseGP.__init__(self, X, Y, Z, kernel, likelihood, inference_method, name, **kwargs)
         self.add_parameter(self.X, index=0)
         self.add_parameter(self.variational_prior)
+
+        if mpi_comm != None:
+            from ..util.mpi import divide_data
+            N_start, N_end, N_list = divide_data(Y.shape[0], mpi_comm)
+            self.N_range = (N_start, N_end)
+            self.N_list = np.array(N_list)
+            self.Y_local = self.Y[N_start:N_end]
+            print 'MPI RANK: '+str(self.mpi_comm.rank)+' with datasize: '+str(self.N_range)
+            mpi_comm.Bcast(self.param_array, root=0)
+
+        if self.group_spike:
+            [self.X.gamma[:,i].tie('tieGamma'+str(i)) for i in xrange(self.X.gamma.shape[1])] # Tie columns together

     def set_X_gradients(self, X, X_grad):
         """Set the gradients of the posterior distribution of X in its specific form."""
         X.mean.gradient, X.variance.gradient, X.binary_prob.gradient = X_grad
+
+    def get_X_gradients(self, X):
+        """Get the gradients of the posterior distribution of X in its specific form."""
+        return X.mean.gradient, X.variance.gradient, X.binary_prob.gradient
+
     def parameters_changed(self):
         if isinstance(self.inference_method, VarDTC_GPU) or isinstance(self.inference_method, VarDTC_minibatch):
-            update_gradients(self)
+            update_gradients(self, mpi_comm=self.mpi_comm)
             return

         super(SSGPLVM, self).parameters_changed()
@@ -104,235 +118,47 @@ class SSGPLVM(SparseGP):

         return dim_reduction_plots.plot_latent(self, plot_inducing=plot_inducing, *args, **kwargs)

-    def do_test_latents(self, Y):
-        """
-        Compute the latent representation for a set of new points Y
-
-        Notes:
-        This will only work with a univariate Gaussian likelihood (for now)
-        """
-        assert not self.likelihood.is_heteroscedastic
-        N_test = Y.shape[0]
-        input_dim = self.Z.shape[1]
-        means = np.zeros((N_test, input_dim))
-        covars = np.zeros((N_test, input_dim))
-
-        dpsi0 = -0.5 * self.output_dim * self.likelihood.precision
-        dpsi2 = self.dL_dpsi2[0][None, :, :] # TODO: this may change if we ignore het. likelihoods
-        V = self.likelihood.precision * Y
-
-        #compute CPsi1V
-        if self.Cpsi1V is None:
-            psi1V = np.dot(self.psi1.T, self.likelihood.V)
-            tmp, _ = linalg.dtrtrs(self._Lm, np.asfortranarray(psi1V), lower=1, trans=0)
-            tmp, _ = linalg.dpotrs(self.LB, tmp, lower=1)
-            self.Cpsi1V, _ = linalg.dtrtrs(self._Lm, tmp, lower=1, trans=1)
-
-        dpsi1 = np.dot(self.Cpsi1V, V.T)
-
-        start = np.zeros(self.input_dim * 2)
-
-        for n, dpsi1_n in enumerate(dpsi1.T[:, :, None]):
-            args = (self.kern, self.Z, dpsi0, dpsi1_n.T, dpsi2)
-            xopt, fopt, neval, status = SCG(f=latent_cost, gradf=latent_grad, x=start, optargs=args, display=False)
-
-            mu, log_S = xopt.reshape(2, 1, -1)
-            means[n] = mu[0].copy()
-            covars[n] = np.exp(log_S[0]).copy()
-
-        return means, covars
-
-    def dmu_dX(self, Xnew):
-        """
-        Calculate the gradient of the prediction at Xnew w.r.t Xnew.
-        """
-        dmu_dX = np.zeros_like(Xnew)
-        for i in range(self.Z.shape[0]):
-            dmu_dX += self.kern.dK_dX(self.Cpsi1Vf[i:i + 1, :], Xnew, self.Z[i:i + 1, :])
-        return dmu_dX
-
-    def dmu_dXnew(self, Xnew):
-        """
-        Individual gradient of prediction at Xnew w.r.t. each sample in Xnew
-        """
-        dK_dX = np.zeros((Xnew.shape[0], self.num_inducing))
-        ones = np.ones((1, 1))
-        for i in range(self.Z.shape[0]):
-            dK_dX[:, i] = self.kern.dK_dX(ones, Xnew, self.Z[i:i + 1, :]).sum(-1)
-        return np.dot(dK_dX, self.Cpsi1Vf)
-
-    def plot_steepest_gradient_map(self, fignum=None, ax=None, which_indices=None, labels=None, data_labels=None, data_marker='o', data_s=40, resolution=20, aspect='auto', updates=False, ** kwargs):
-        input_1, input_2 = significant_dims = most_significant_input_dimensions(self, which_indices)
-
-        X = np.zeros((resolution ** 2, self.input_dim))
-        indices = np.r_[:X.shape[0]]
-        if labels is None:
-            labels = range(self.output_dim)
-
-        def plot_function(x):
-            X[:, significant_dims] = x
-            dmu_dX = self.dmu_dXnew(X)
-            argmax = np.argmax(dmu_dX, 1)
-            return dmu_dX[indices, argmax], np.array(labels)[argmax]
-
-        if ax is None:
-            fig = pyplot.figure(num=fignum)
-            ax = fig.add_subplot(111)
-
-        if data_labels is None:
-            data_labels = np.ones(self.num_data)
-        ulabels = []
-        for lab in data_labels:
-            if not lab in ulabels:
-                ulabels.append(lab)
-        marker = itertools.cycle(list(data_marker))
-        from GPy.util import Tango
-        for i, ul in enumerate(ulabels):
-            if type(ul) is np.string_:
-                this_label = ul
-            elif type(ul) is np.int64:
-                this_label = 'class %i' % ul
-            else:
-                this_label = 'class %i' % i
-            m = marker.next()
-            index = np.nonzero(data_labels == ul)[0]
-            x = self.X[index, input_1]
-            y = self.X[index, input_2]
-            ax.scatter(x, y, marker=m, s=data_s, color=Tango.nextMedium(), label=this_label)
-
-        ax.set_xlabel('latent dimension %i' % input_1)
-        ax.set_ylabel('latent dimension %i' % input_2)
-
-        from matplotlib.cm import get_cmap
-        from GPy.util.latent_space_visualizations.controllers.imshow_controller import ImAnnotateController
-        if not 'cmap' in kwargs.keys():
-            kwargs.update(cmap=get_cmap('jet'))
-        controller = ImAnnotateController(ax,
-                                          plot_function,
-                                          tuple(self.X.min(0)[:, significant_dims]) + tuple(self.X.max(0)[:, significant_dims]),
-                                          resolution=resolution,
-                                          aspect=aspect,
-                                          **kwargs)
-        ax.legend()
-        ax.figure.tight_layout()
-        if updates:
-            pyplot.show()
-            clear = raw_input('Enter to continue')
-            if clear.lower() in 'yes' or clear == '':
-                controller.deactivate()
-        return controller.view
-
-    def plot_X_1d(self, fignum=None, ax=None, colors=None):
-        """
-        Plot latent space X in 1D:
-
-            - if fig is given, create input_dim subplots in fig and plot in these
-            - if ax is given plot input_dim 1D latent space plots of X into each `axis`
-            - if neither fig nor ax is given create a figure with fignum and plot in there
-
-        colors:
-            colors of different latent space dimensions input_dim
-        """
-        import pylab
-        if ax is None:
-            fig = pylab.figure(num=fignum, figsize=(8, min(12, (2 * self.X.shape[1]))))
-            if colors is None:
-                colors = pylab.gca()._get_lines.color_cycle
-                pylab.clf()
-            else:
-                colors = iter(colors)
-        plots = []
-        x = np.arange(self.X.shape[0])
-        for i in range(self.X.shape[1]):
-            if ax is None:
-                a = fig.add_subplot(self.X.shape[1], 1, i + 1)
-            elif isinstance(ax, (tuple, list)):
-                a = ax[i]
-            else:
-                raise ValueError("Need one ax per latent dimension input_dim")
-            a.plot(self.X, c='k', alpha=.3)
-            plots.extend(a.plot(x, self.X.T[i], c=colors.next(), label=r"$\mathbf{{X_{{{}}}}}$".format(i)))
-            a.fill_between(x,
-                           self.X.T[i] - 2 * np.sqrt(self.X_variance.T[i]),
-                           self.X.T[i] + 2 * np.sqrt(self.X_variance.T[i]),
-                           facecolor=plots[-1].get_color(),
-                           alpha=.3)
-            a.legend(borderaxespad=0.)
-            a.set_xlim(x.min(), x.max())
-            if i < self.X.shape[1] - 1:
-                a.set_xticklabels('')
-        pylab.draw()
-        if ax is None:
-            fig.tight_layout(h_pad=.01) # , rect=(0, 0, 1, .95))
-        return fig
-
-    def getstate(self):
-        """
-        Get the current state of the class,
-        here just all the indices, rest can get recomputed
-        """
-        return SparseGP._getstate(self) + [self.init]
-
-    def setstate(self, state):
-        self._const_jitter = None
-        self.init = state.pop()
-        SparseGP._setstate(self, state)
-
-
-def latent_cost_and_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
-    """
-    objective function for fitting the latent variables for test points
-    (negative log-likelihood: should be minimised!)
-    """
-    mu, log_S = mu_S.reshape(2, 1, -1)
-    S = np.exp(log_S)
-
-    psi0 = kern.psi0(Z, mu, S)
-    psi1 = kern.psi1(Z, mu, S)
-    psi2 = kern.psi2(Z, mu, S)
-
-    lik = dL_dpsi0 * psi0 + np.dot(dL_dpsi1.flatten(), psi1.flatten()) + np.dot(dL_dpsi2.flatten(), psi2.flatten()) - 0.5 * np.sum(np.square(mu) + S) + 0.5 * np.sum(log_S)
-
-    mu0, S0 = kern.dpsi0_dmuS(dL_dpsi0, Z, mu, S)
-    mu1, S1 = kern.dpsi1_dmuS(dL_dpsi1, Z, mu, S)
-    mu2, S2 = kern.dpsi2_dmuS(dL_dpsi2, Z, mu, S)
-
-    dmu = mu0 + mu1 + mu2 - mu
-    # dS = S0 + S1 + S2 -0.5 + .5/S
-    dlnS = S * (S0 + S1 + S2 - 0.5) + .5
-    return -lik, -np.hstack((dmu.flatten(), dlnS.flatten()))
-
-def latent_cost(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
-    """
-    objective function for fitting the latent variables (negative log-likelihood: should be minimised!)
-    This is the same as latent_cost_and_grad but only for the objective
-    """
-    mu, log_S = mu_S.reshape(2, 1, -1)
-    S = np.exp(log_S)
-
-    psi0 = kern.psi0(Z, mu, S)
-    psi1 = kern.psi1(Z, mu, S)
-    psi2 = kern.psi2(Z, mu, S)
-
-    lik = dL_dpsi0 * psi0 + np.dot(dL_dpsi1.flatten(), psi1.flatten()) + np.dot(dL_dpsi2.flatten(), psi2.flatten()) - 0.5 * np.sum(np.square(mu) + S) + 0.5 * np.sum(log_S)
-    return -float(lik)
-
-def latent_grad(mu_S, kern, Z, dL_dpsi0, dL_dpsi1, dL_dpsi2):
-    """
-    This is the same as latent_cost_and_grad but only for the grad
-    """
-    mu, log_S = mu_S.reshape(2, 1, -1)
-    S = np.exp(log_S)
-
-    mu0, S0 = kern.dpsi0_dmuS(dL_dpsi0, Z, mu, S)
-    mu1, S1 = kern.dpsi1_dmuS(dL_dpsi1, Z, mu, S)
-    mu2, S2 = kern.dpsi2_dmuS(dL_dpsi2, Z, mu, S)
-
-    dmu = mu0 + mu1 + mu2 - mu
-    # dS = S0 + S1 + S2 -0.5 + .5/S
-    dlnS = S * (S0 + S1 + S2 - 0.5) + .5
-
-    return -np.hstack((dmu.flatten(), dlnS.flatten()))
+    def __getstate__(self):
+        dc = super(SSGPLVM, self).__getstate__()
+        dc['mpi_comm'] = None
+        if self.mpi_comm != None:
+            del dc['N_range']
+            del dc['N_list']
+            del dc['Y_local']
+        return dc
+
+    def __setstate__(self, state):
+        return super(SSGPLVM, self).__setstate__(state)
+
+    #=====================================================
+    # The MPI parallelization
+    # - can move to model at some point
+    #=====================================================
+
+    def _set_params_transformed(self, p):
+        if self.mpi_comm != None:
+            if self.__IN_OPTIMIZATION__ and self.mpi_comm.rank==0:
+                self.mpi_comm.Bcast(np.int32(1), root=0)
+            self.mpi_comm.Bcast(p, root=0)
+        super(SSGPLVM, self)._set_params_transformed(p)
+
+    def optimize(self, optimizer=None, start=None, **kwargs):
+        self.__IN_OPTIMIZATION__ = True
+        if self.mpi_comm==None:
+            super(SSGPLVM, self).optimize(optimizer, start, **kwargs)
+        elif self.mpi_comm.rank==0:
+            super(SSGPLVM, self).optimize(optimizer, start, **kwargs)
+            self.mpi_comm.Bcast(np.int32(-1), root=0)
+        elif self.mpi_comm.rank>0:
+            x = self._get_params_transformed().copy()
+            flag = np.empty(1, dtype=np.int32)
+            while True:
+                self.mpi_comm.Bcast(flag, root=0)
+                if flag==1:
+                    self._set_params_transformed(x)
+                elif flag==-1:
+                    break
+                else:
+                    self.__IN_OPTIMIZATION__ = False
+                    raise Exception("Unrecognizable flag for synchronization!")
+        self.__IN_OPTIMIZATION__ = False
34
GPy/models/ss_mrd.py
Normal file
@@ -0,0 +1,34 @@
"""
The Manifold Relevance Determination model with the spike-and-slab prior
"""

from ..core import Model
from .ss_gplvm import SSGPLVM

class SSMRD(Model):

    def __init__(self, Ylist, input_dim, X=None, X_variance=None,
                 initx = 'PCA', initz = 'permute',
                 num_inducing=10, Z=None, kernel=None,
                 inference_method=None, likelihoods=None, name='ss_mrd', Ynames=None):
        super(SSMRD, self).__init__(name)

        self.updates = False
        self.models = [SSGPLVM(y, input_dim, X=X, X_variance=X_variance, num_inducing=num_inducing, Z=Z, init=initx,
                               kernel=kernel.copy() if kernel else None, inference_method=inference_method, likelihood=likelihoods,
                               name='model_'+str(i)) for i, y in enumerate(Ylist)]
        self.add_parameters(*(self.models))

        [[[self.models[m].X.mean[i,j:j+1].tie('mean_'+str(i)+'_'+str(j)) for m in xrange(len(self.models))] for j in xrange(self.models[0].X.mean.shape[1])]
            for i in xrange(self.models[0].X.mean.shape[0])]
        [[[self.models[m].X.variance[i,j:j+1].tie('var_'+str(i)+'_'+str(j)) for m in xrange(len(self.models))] for j in xrange(self.models[0].X.variance.shape[1])]
            for i in xrange(self.models[0].X.variance.shape[0])]

        self.updates = True

    def parameters_changed(self):
        super(SSMRD, self).parameters_changed()
        self._log_marginal_likelihood = sum([m._log_marginal_likelihood for m in self.models])

    def log_likelihood(self):
        return self._log_marginal_likelihood
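A hedged construction sketch for this new wrapper, assuming it becomes importable as GPy.models.SSMRD through the __init__ change above; each output matrix gets its own SSGPLVM while the latent means and variances are tied across models. The data shapes are illustrative only.

    import numpy as np
    import GPy

    Y1 = np.random.randn(40, 6)     # two views of the same 40 samples
    Y2 = np.random.randn(40, 8)
    m = GPy.models.SSMRD([Y1, Y2], input_dim=3, num_inducing=10)
    # m.optimize() would then train both SSGPLVMs with their latent coordinates tied together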
@@ -93,7 +93,7 @@ def plot_SpikeSlab(parameterized, fignum=None, ax=None, colors=None, side_by_sid
             a.set_xticklabels('')
         # binary prob plot
         a = fig.add_subplot(*sub2)
-        a.bar(x,gamma[:,i],bottom=0.,linewidth=0,align='center')
+        a.bar(x,gamma[:,i],bottom=0.,linewidth=0,width=1.0,align='center')
         a.set_xlim(x.min(), x.max())
         a.set_ylim([0.,1.])
     pb.draw()
@@ -88,7 +88,7 @@ class vector_show(matplotlib_show):


 class lvm(matplotlib_show):
-    def __init__(self, vals, model, data_visualize, latent_axes=None, sense_axes=None, latent_index=[0,1]):
+    def __init__(self, vals, model, data_visualize, latent_axes=None, sense_axes=None, latent_index=[0,1], disable_drag=False):
         """Visualize a latent variable model

         :param model: the latent variable model to visualize.
@@ -107,12 +107,14 @@ class lvm(matplotlib_show):

         if isinstance(latent_axes,mpl.axes.Axes):
             self.cid = latent_axes.figure.canvas.mpl_connect('button_press_event', self.on_click)
-            self.cid = latent_axes.figure.canvas.mpl_connect('motion_notify_event', self.on_move)
+            if not disable_drag:
+                self.cid = latent_axes.figure.canvas.mpl_connect('motion_notify_event', self.on_move)
             self.cid = latent_axes.figure.canvas.mpl_connect('axes_leave_event', self.on_leave)
             self.cid = latent_axes.figure.canvas.mpl_connect('axes_enter_event', self.on_enter)
         else:
             self.cid = latent_axes[0].figure.canvas.mpl_connect('button_press_event', self.on_click)
-            self.cid = latent_axes[0].figure.canvas.mpl_connect('motion_notify_event', self.on_move)
+            if not disable_drag:
+                self.cid = latent_axes[0].figure.canvas.mpl_connect('motion_notify_event', self.on_move)
             self.cid = latent_axes[0].figure.canvas.mpl_connect('axes_leave_event', self.on_leave)
             self.cid = latent_axes[0].figure.canvas.mpl_connect('axes_enter_event', self.on_enter)

@@ -124,6 +126,7 @@ class lvm(matplotlib_show):
         self.move_on = False
         self.latent_index = latent_index
         self.latent_dim = model.input_dim
+        self.disable_drag = disable_drag

         # The red cross which shows current latent point.
         self.latent_values = vals
@@ -147,8 +150,13 @@ class lvm(matplotlib_show):

     def on_click(self, event):
         if event.inaxes!=self.latent_axes: return
-        self.move_on = not self.move_on
-        self.called = True
+        if self.disable_drag:
+            self.move_on = True
+            self.called = True
+            self.on_move(event)
+        else:
+            self.move_on = not self.move_on
+            self.called = True

     def on_move(self, event):
         if event.inaxes!=self.latent_axes: return
@@ -274,8 +282,11 @@ class image_show(matplotlib_show):
     :param preset_mean: the preset mean of a scaled image.
     :type preset_mean: double
     :param preset_std: the preset standard deviation of a scaled image.
-    :type preset_std: double"""
-    def __init__(self, vals, axes=None, dimensions=(16,16), transpose=False, order='C', invert=False, scale=False, palette=[], preset_mean=0., preset_std=1., select_image=0):
+    :type preset_std: double
+    :param cmap: the colormap for image visualization
+    :type cmap: matplotlib.cm"""
+
+    def __init__(self, vals, axes=None, dimensions=(16,16), transpose=False, order='C', invert=False, scale=False, palette=[], preset_mean=0., preset_std=1., select_image=0, cmap=None):
         matplotlib_show.__init__(self, vals, axes)
         self.dimensions = dimensions
         self.transpose = transpose
@@ -290,8 +301,10 @@ class image_show(matplotlib_show):
         self.set_image(self.vals)
         if not self.palette == []: # Can just show the image (self.set_image() took care of setting the palette)
             self.handle = self.axes.imshow(self.vals, interpolation='nearest')
-        else: # Use a boring gray map.
-            self.handle = self.axes.imshow(self.vals, cmap=plt.cm.gray, interpolation='nearest') # @UndefinedVariable
+        elif cmap==None: # Use a jet map.
+            self.handle = self.axes.imshow(self.vals, cmap=plt.cm.jet, interpolation='nearest') # @UndefinedVariable
+        else: # Use the selected map.
+            self.handle = self.axes.imshow(self.vals, cmap=cmap, interpolation='nearest') # @UndefinedVariable
         plt.show()

     def modify(self, vals):
@@ -399,7 +412,7 @@ class mocap_data_show(matplotlib_show):
     def __init__(self, vals, axes=None, connect=None):
         if axes==None:
             fig = plt.figure()
-            axes = fig.add_subplot(111, projection='3d',aspect='equal')
+            axes = fig.add_subplot(111, projection='3d', aspect='equal')
         matplotlib_show.__init__(self, vals, axes)

         self.connect = connect
@@ -437,6 +450,7 @@ class mocap_data_show(matplotlib_show):
         self.process_values()
         self.initialize_axes_modify()
         self.draw_vertices()
+        self.initialize_axes()
         self.finalize_axes_modify()
         self.draw_edges()
         self.axes.figure.canvas.draw()
@@ -459,10 +473,10 @@ class mocap_data_show(matplotlib_show):
         self.axes.set_xlim(self.x_lim)
         self.axes.set_ylim(self.y_lim)
         self.axes.set_zlim(self.z_lim)
-        self.axes.auto_scale_xyz([-1., 1.], [-1., 1.], [-1.5, 1.5])
+        self.axes.auto_scale_xyz([-1., 1.], [-1., 1.], [-1., 1.])

-        #self.axes.set_aspect('equal')
-        self.axes.autoscale(enable=False)
+        # self.axes.set_aspect('equal')
+        # self.axes.autoscale(enable=False)

     def finalize_axes_modify(self):
         self.axes.set_xlim(self.x_lim)
@@ -16,6 +16,7 @@ import diag
 import initialization
 import multioutput
 import linalg_gpu
+import mpi

 try:
     import sympy
@@ -5,12 +5,44 @@ Global variables: initSuccess
 providing CUBLAS handle: cublas_handle
 """

+gpu_initialized = False
+gpu_device = None
+gpu_context = None
+MPI_enabled = False
+
+try:
+    from mpi4py import MPI
+    MPI_enabled = True
+except:
+    pass
+
+try:
+    if MPI_enabled and MPI.COMM_WORLD.size>1:
+        from .parallel import get_id_within_node
+        gpuid = get_id_within_node()
+        import pycuda.driver
+        pycuda.driver.init()
+        if gpuid>=pycuda.driver.Device.count():
+            print '['+MPI.Get_processor_name()+'] more processes than the GPU numbers!'
+            #MPI.COMM_WORLD.Abort()
+            raise
+        gpu_device = pycuda.driver.Device(gpuid)
+        gpu_context = gpu_device.make_context()
+        gpu_initialized = True
+    else:
+        import pycuda.autoinit
+        gpu_initialized = True
+except:
+    pass
+
 try:
-    import pycuda.autoinit
     from scikits.cuda import cublas
     import scikits.cuda.linalg as culinalg
     culinalg.init()
     cublas_handle = cublas.cublasCreate()
-    initSuccess = True
 except:
-    initSuccess = False
+    pass
+
+def closeGPU():
+    if gpu_context is not None:
+        gpu_context.detach()
@@ -19,6 +19,9 @@ try:

     strideSum = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="i%step==0?x[i]:0", arguments="double *x, int step")

+    # np.trace(np.dot(A,B)) (also equivalent to (A*B.T).sum() ) A - a1 x a2, B - a2 x a1
+    traceDot = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="A[i]*B[(i%a1)*a2+i/a1]", arguments="double *A, double *B, int a1, int a2")
+
     #=======================================================================================
     # Element-wise functions
     #=======================================================================================
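The comment above states the identity the new traceDot reduction relies on; a quick numpy check of that identity (plain CPU code, independent of PyCUDA):

    import numpy as np
    A = np.random.randn(4, 6)
    B = np.random.randn(6, 4)
    assert np.allclose(np.trace(np.dot(A, B)), (A * B.T).sum())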
@@ -57,3 +60,34 @@ try:

 except:
     pass
+
+try:
+    import scikits.cuda.linalg as culinalg
+    from scikits.cuda import cublas
+    from scikits.cuda.cula import culaExceptions
+except:
+    pass
+
+def jitchol(A, L, cublas_handle, maxtries=5):
+    try:
+        cublas.cublasDcopy(cublas_handle, A.size, A.gpudata, 1, L.gpudata, 1)
+        culinalg.cho_factor(L,'L')
+    except culaExceptions:
+        diagA = np.diag(A)
+        if np.any(diagA <= 0.):
+            raise linalg.LinAlgError, "not pd: non-positive diagonal elements"
+        jitter = diagA.mean() * 1e-6
+        while maxtries > 0 and np.isfinite(jitter):
+            print 'Warning: adding jitter of {:.10e}'.format(jitter)
+            try:
+                return linalg.cholesky(A + np.eye(A.shape[0]).T * jitter, lower=True)
+            except:
+                jitter *= 10
+            finally:
+                maxtries -= 1
+        raise linalg.LinAlgError, "not positive definite, even with jitter."
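The fallback branch above is the usual "add jitter and retry" trick for a Cholesky factorisation that fails numerically. A CPU-only illustration of the same idea with numpy/scipy (not the GPU code path):

    import numpy as np
    from scipy import linalg
    A = np.ones((3, 3))                  # positive semi-definite but singular
    jitter = np.diag(A).mean() * 1e-6
    L = linalg.cholesky(A + np.eye(3) * jitter, lower=True)   # succeeds once the diagonal is lifted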
35
GPy/util/mpi.py
Normal file
@@ -0,0 +1,35 @@
"""
The tools for mpi
"""

try:
    import numpy as np
    from mpi4py import MPI
    numpy_to_MPI_typemap = {
        np.dtype(np.float64) : MPI.DOUBLE,
        np.dtype(np.float32) : MPI.FLOAT,
        np.dtype(np.int) : MPI.INT,
        np.dtype(np.int8) : MPI.CHAR,
        np.dtype(np.uint8) : MPI.UNSIGNED_CHAR,
        np.dtype(np.int32) : MPI.INT,
        np.dtype(np.uint32) : MPI.UNSIGNED_INT,
    }
except:
    pass

def divide_data(datanum, comm):
    residue = (datanum)%comm.size
    datanum_list = np.empty((comm.size),dtype=np.int32)
    for i in xrange(comm.size):
        if i<residue:
            datanum_list[i] = int(datanum/comm.size)+1
        else:
            datanum_list[i] = int(datanum/comm.size)
    if comm.rank<residue:
        size = datanum/comm.size+1
        offset = size*comm.rank
    else:
        size = datanum/comm.size
        offset = size*comm.rank+residue
    return offset, offset+size, datanum_list
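divide_data splits N data points as evenly as possible over comm.size ranks, giving the first N % size ranks one extra point, and returns this rank's (start, end) slice plus the per-rank counts. A small worked example with a stand-in communicator object (purely illustrative, no MPI needed):

    from GPy.util.mpi import divide_data

    class FakeComm(object):     # stand-in with just the attributes divide_data reads
        size, rank = 3, 1

    start, end, counts = divide_data(10, FakeComm())
    print start, end, list(counts)    # -> 4 7 [4, 3, 3]: rank 1 owns points 4..6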
14
GPy/util/parallel.py
Normal file
@@ -0,0 +1,14 @@
"""
The module of tools for parallelization (MPI)
"""

try:
    from mpi4py import MPI
except:
    pass

def get_id_within_node(comm=MPI.COMM_WORLD):
    rank = comm.rank
    nodename = MPI.Get_processor_name()
    nodelist = comm.allgather(nodename)
    return len([i for i in nodelist[:rank] if i==nodename])
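get_id_within_node counts how many lower-ranked processes report the same hostname, and that count is what the GPU initialisation above uses as the node-local GPU index. An illustration with a hypothetical allgather result for four ranks spread over two hosts:

    nodelist = ['node1', 'node1', 'node2', 'node1']   # hostnames gathered from ranks 0..3
    rank = 3
    local_id = len([n for n in nodelist[:rank] if n == nodelist[rank]])
    print local_id    # -> 2, so rank 3 would claim GPU 2 on node1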