mirror of
https://github.com/SheffieldML/GPy.git
synced 2026-05-07 02:52:40 +02:00
Merge branch 'mean_functions' into devel
This commit is contained in:
commit
8c71d52b7f
27 changed files with 461 additions and 137 deletions
|
|
@ -5,6 +5,7 @@ import numpy as np
|
||||||
import sys
|
import sys
|
||||||
from .. import kern
|
from .. import kern
|
||||||
from model import Model
|
from model import Model
|
||||||
|
from mapping import Mapping
|
||||||
from parameterization import ObsAr
|
from parameterization import ObsAr
|
||||||
from .. import likelihoods
|
from .. import likelihoods
|
||||||
from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation
|
from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation
|
||||||
|
|
@ -34,7 +35,7 @@ class GP(Model):
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, X, Y, kernel, likelihood, inference_method=None, name='gp', Y_metadata=None, normalizer=False):
|
def __init__(self, X, Y, kernel, likelihood, mean_function=None, inference_method=None, name='gp', Y_metadata=None, normalizer=False):
|
||||||
super(GP, self).__init__(name)
|
super(GP, self).__init__(name)
|
||||||
|
|
||||||
assert X.ndim == 2
|
assert X.ndim == 2
|
||||||
|
|
@ -75,6 +76,15 @@ class GP(Model):
|
||||||
assert isinstance(likelihood, likelihoods.Likelihood)
|
assert isinstance(likelihood, likelihoods.Likelihood)
|
||||||
self.likelihood = likelihood
|
self.likelihood = likelihood
|
||||||
|
|
||||||
|
#handle the mean function
|
||||||
|
self.mean_function = mean_function
|
||||||
|
if mean_function is not None:
|
||||||
|
assert isinstance(self.mean_function, Mapping)
|
||||||
|
assert mean_function.input_dim == self.input_dim
|
||||||
|
assert mean_function.output_dim == self.output_dim
|
||||||
|
self.link_parameter(mean_function)
|
||||||
|
|
||||||
|
|
||||||
#find a sensible inference method
|
#find a sensible inference method
|
||||||
logger.info("initializing inference method")
|
logger.info("initializing inference method")
|
||||||
if inference_method is None:
|
if inference_method is None:
|
||||||
|
|
@ -153,9 +163,11 @@ class GP(Model):
|
||||||
This method is not designed to be called manually, the framework is set up to automatically call this method upon changes to parameters, if you call
|
This method is not designed to be called manually, the framework is set up to automatically call this method upon changes to parameters, if you call
|
||||||
this method yourself, there may be unexpected consequences.
|
this method yourself, there may be unexpected consequences.
|
||||||
"""
|
"""
|
||||||
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.Y_metadata)
|
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.mean_function, self.Y_metadata)
|
||||||
self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
|
self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
|
||||||
self.kern.update_gradients_full(self.grad_dict['dL_dK'], self.X)
|
self.kern.update_gradients_full(self.grad_dict['dL_dK'], self.X)
|
||||||
|
if self.mean_function is not None:
|
||||||
|
self.mean_function.update_gradients(self.grad_dict['dL_dm'], self.X)
|
||||||
|
|
||||||
def log_likelihood(self):
|
def log_likelihood(self):
|
||||||
"""
|
"""
|
||||||
|
|
@ -192,6 +204,10 @@ class GP(Model):
|
||||||
|
|
||||||
#force mu to be a column vector
|
#force mu to be a column vector
|
||||||
if len(mu.shape)==1: mu = mu[:,None]
|
if len(mu.shape)==1: mu = mu[:,None]
|
||||||
|
|
||||||
|
#add the mean function in
|
||||||
|
if not self.mean_function is None:
|
||||||
|
mu += self.mean_function.f(_Xnew)
|
||||||
return mu, var
|
return mu, var
|
||||||
|
|
||||||
def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None):
|
def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None):
|
||||||
|
|
|
||||||
|
|
@ -19,7 +19,7 @@ class SparseGP(GP):
|
||||||
This model allows (approximate) inference using variational DTC or FITC
|
This model allows (approximate) inference using variational DTC or FITC
|
||||||
(Gaussian likelihoods) as well as non-conjugate sparse methods based on
|
(Gaussian likelihoods) as well as non-conjugate sparse methods based on
|
||||||
these.
|
these.
|
||||||
|
|
||||||
This is not for missing data, as the implementation for missing data involves
|
This is not for missing data, as the implementation for missing data involves
|
||||||
some inefficient optimization routine decisions.
|
some inefficient optimization routine decisions.
|
||||||
See missing data SparseGP implementation in py:class:'~GPy.models.sparse_gp_minibatch.SparseGPMiniBatch'.
|
See missing data SparseGP implementation in py:class:'~GPy.models.sparse_gp_minibatch.SparseGPMiniBatch'.
|
||||||
|
|
@ -39,7 +39,7 @@ class SparseGP(GP):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
|
def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, inference_method=None,
|
||||||
name='sparse gp', Y_metadata=None, normalizer=False):
|
name='sparse gp', Y_metadata=None, normalizer=False):
|
||||||
#pick a sensible inference method
|
#pick a sensible inference method
|
||||||
if inference_method is None:
|
if inference_method is None:
|
||||||
|
|
@ -53,7 +53,7 @@ class SparseGP(GP):
|
||||||
self.Z = Param('inducing inputs', Z)
|
self.Z = Param('inducing inputs', Z)
|
||||||
self.num_inducing = Z.shape[0]
|
self.num_inducing = Z.shape[0]
|
||||||
|
|
||||||
GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
|
GP.__init__(self, X, Y, kernel, likelihood, mean_function, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
|
||||||
|
|
||||||
logger.info("Adding Z as parameter")
|
logger.info("Adding Z as parameter")
|
||||||
self.link_parameter(self.Z, index=0)
|
self.link_parameter(self.Z, index=0)
|
||||||
|
|
@ -61,7 +61,7 @@ class SparseGP(GP):
|
||||||
|
|
||||||
def has_uncertain_inputs(self):
|
def has_uncertain_inputs(self):
|
||||||
return isinstance(self.X, VariationalPosterior)
|
return isinstance(self.X, VariationalPosterior)
|
||||||
|
|
||||||
def set_Z(self, Z, trigger_update=True):
|
def set_Z(self, Z, trigger_update=True):
|
||||||
if trigger_update: self.update_model(False)
|
if trigger_update: self.update_model(False)
|
||||||
self.unlink_parameter(self.Z)
|
self.unlink_parameter(self.Z)
|
||||||
|
|
@ -110,8 +110,8 @@ class SparseGP(GP):
|
||||||
|
|
||||||
def _raw_predict(self, Xnew, full_cov=False, kern=None):
|
def _raw_predict(self, Xnew, full_cov=False, kern=None):
|
||||||
"""
|
"""
|
||||||
Make a prediction for the latent function values.
|
Make a prediction for the latent function values.
|
||||||
|
|
||||||
For certain inputs we give back a full_cov of shape NxN,
|
For certain inputs we give back a full_cov of shape NxN,
|
||||||
if there is missing data, each dimension has its own full_cov of shape NxNxD, and if full_cov is of,
|
if there is missing data, each dimension has its own full_cov of shape NxNxD, and if full_cov is of,
|
||||||
we take only the diagonal elements across N.
|
we take only the diagonal elements across N.
|
||||||
|
|
@ -136,6 +136,9 @@ class SparseGP(GP):
|
||||||
else:
|
else:
|
||||||
Kxx = kern.Kdiag(Xnew)
|
Kxx = kern.Kdiag(Xnew)
|
||||||
var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
|
var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
|
||||||
|
#add in the mean function
|
||||||
|
if self.mean_function is not None:
|
||||||
|
mu += self.mean_function.f(Xnew)
|
||||||
else:
|
else:
|
||||||
psi0_star = self.kern.psi0(self.Z, Xnew)
|
psi0_star = self.kern.psi0(self.Z, Xnew)
|
||||||
psi1_star = self.kern.psi1(self.Z, Xnew)
|
psi1_star = self.kern.psi1(self.Z, Xnew)
|
||||||
|
|
@ -165,4 +168,5 @@ class SparseGP(GP):
|
||||||
var[i] = var_
|
var[i] = var_
|
||||||
else:
|
else:
|
||||||
var[i] = np.diag(var_)+p0-t2
|
var[i] = np.diag(var_)+p0-t2
|
||||||
|
|
||||||
return mu, var
|
return mu, var
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,7 @@ from ..inference.latent_function_inference import SVGP as svgp_inf
|
||||||
|
|
||||||
|
|
||||||
class SVGP(SparseGP):
|
class SVGP(SparseGP):
|
||||||
def __init__(self, X, Y, Z, kernel, likelihood, name='SVGP', Y_metadata=None, batchsize=None):
|
def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, name='SVGP', Y_metadata=None, batchsize=None):
|
||||||
"""
|
"""
|
||||||
Stochastic Variational GP.
|
Stochastic Variational GP.
|
||||||
|
|
||||||
|
|
@ -38,7 +38,7 @@ class SVGP(SparseGP):
|
||||||
#create the SVI inference method
|
#create the SVI inference method
|
||||||
inf_method = svgp_inf()
|
inf_method = svgp_inf()
|
||||||
|
|
||||||
SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, inference_method=inf_method,
|
SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, mean_function=mean_function, inference_method=inf_method,
|
||||||
name=name, Y_metadata=Y_metadata, normalizer=False)
|
name=name, Y_metadata=Y_metadata, normalizer=False)
|
||||||
|
|
||||||
self.m = Param('q_u_mean', np.zeros((self.num_inducing, Y.shape[1])))
|
self.m = Param('q_u_mean', np.zeros((self.num_inducing, Y.shape[1])))
|
||||||
|
|
@ -48,7 +48,7 @@ class SVGP(SparseGP):
|
||||||
self.link_parameter(self.m)
|
self.link_parameter(self.m)
|
||||||
|
|
||||||
def parameters_changed(self):
|
def parameters_changed(self):
|
||||||
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0]))
|
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.mean_function, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0]))
|
||||||
|
|
||||||
#update the kernel gradients
|
#update the kernel gradients
|
||||||
self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z)
|
self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z)
|
||||||
|
|
@ -65,6 +65,13 @@ class SVGP(SparseGP):
|
||||||
self.m.gradient = self.grad_dict['dL_dm']
|
self.m.gradient = self.grad_dict['dL_dm']
|
||||||
self.chol.gradient = self.grad_dict['dL_dchol']
|
self.chol.gradient = self.grad_dict['dL_dchol']
|
||||||
|
|
||||||
|
if self.mean_function is not None:
|
||||||
|
self.mean_function.update_gradients(self.grad_dict['dL_dmfX'], self.X)
|
||||||
|
g = self.mean_function.gradient[:].copy()
|
||||||
|
self.mean_function.update_gradients(self.grad_dict['dL_dmfZ'], self.Z)
|
||||||
|
self.mean_function.gradient[:] += g
|
||||||
|
self.Z.gradient[:] += self.mean_function.gradients_X(self.grad_dict['dL_dmfZ'], self.Z)
|
||||||
|
|
||||||
def set_data(self, X, Y):
|
def set_data(self, X, Y):
|
||||||
"""
|
"""
|
||||||
Set the data without calling parameters_changed to avoid wasted computation
|
Set the data without calling parameters_changed to avoid wasted computation
|
||||||
|
|
|
||||||
|
|
@ -505,3 +505,48 @@ def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
|
||||||
|
|
||||||
print m
|
print m
|
||||||
return m
|
return m
|
||||||
|
|
||||||
|
def simple_mean_function(max_iters=100, optimize=True, plot=True):
|
||||||
|
"""
|
||||||
|
The simplest possible mean function. No parameters, just a simple Sinusoid.
|
||||||
|
"""
|
||||||
|
#create simple mean function
|
||||||
|
mf = GPy.core.Mapping(1,1)
|
||||||
|
mf.f = np.sin
|
||||||
|
mf.update_gradients = lambda a,b: None
|
||||||
|
|
||||||
|
X = np.linspace(0,10,50).reshape(-1,1)
|
||||||
|
Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape)
|
||||||
|
|
||||||
|
k =GPy.kern.RBF(1)
|
||||||
|
lik = GPy.likelihoods.Gaussian()
|
||||||
|
m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
|
||||||
|
if optimize:
|
||||||
|
m.optimize(max_iters=max_iters)
|
||||||
|
if plot:
|
||||||
|
m.plot(plot_limits=(-10,15))
|
||||||
|
return m
|
||||||
|
|
||||||
|
def parametric_mean_function(max_iters=100, optimize=True, plot=True):
|
||||||
|
"""
|
||||||
|
A linear mean function with parameters that we'll learn alongside the kernel
|
||||||
|
"""
|
||||||
|
#create simple mean function
|
||||||
|
mf = GPy.core.Mapping(1,1)
|
||||||
|
mf.f = np.sin
|
||||||
|
|
||||||
|
X = np.linspace(0,10,50).reshape(-1,1)
|
||||||
|
Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape) + 3*X
|
||||||
|
|
||||||
|
mf = GPy.mappings.Linear(1,1)
|
||||||
|
|
||||||
|
k =GPy.kern.RBF(1)
|
||||||
|
lik = GPy.likelihoods.Gaussian()
|
||||||
|
m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
|
||||||
|
if optimize:
|
||||||
|
m.optimize(max_iters=max_iters)
|
||||||
|
if plot:
|
||||||
|
m.plot()
|
||||||
|
return m
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -20,7 +20,8 @@ class DTC(LatentFunctionInference):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.const_jitter = 1e-6
|
self.const_jitter = 1e-6
|
||||||
|
|
||||||
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
|
def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
|
||||||
|
assert mean_function is None, "inference with a mean function not implemented"
|
||||||
assert X_variance is None, "cannot use X_variance with DTC. Try varDTC."
|
assert X_variance is None, "cannot use X_variance with DTC. Try varDTC."
|
||||||
|
|
||||||
num_inducing, _ = Z.shape
|
num_inducing, _ = Z.shape
|
||||||
|
|
@ -88,7 +89,8 @@ class vDTC(object):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.const_jitter = 1e-6
|
self.const_jitter = 1e-6
|
||||||
|
|
||||||
def inference(self, kern, X, X_variance, Z, likelihood, Y, Y_metadata):
|
def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
|
||||||
|
assert mean_function is None, "inference with a mean function not implemented"
|
||||||
assert X_variance is None, "cannot use X_variance with DTC. Try varDTC."
|
assert X_variance is None, "cannot use X_variance with DTC. Try varDTC."
|
||||||
|
|
||||||
num_inducing, _ = Z.shape
|
num_inducing, _ = Z.shape
|
||||||
|
|
|
||||||
|
|
@ -36,11 +36,18 @@ class ExactGaussianInference(LatentFunctionInference):
|
||||||
#print "WARNING: N>D of Y, we need caching of L, such that L*L^T = Y, returning Y still!"
|
#print "WARNING: N>D of Y, we need caching of L, such that L*L^T = Y, returning Y still!"
|
||||||
return Y
|
return Y
|
||||||
|
|
||||||
def inference(self, kern, X, likelihood, Y, Y_metadata=None):
|
def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None):
|
||||||
"""
|
"""
|
||||||
Returns a Posterior class containing essential quantities of the posterior
|
Returns a Posterior class containing essential quantities of the posterior
|
||||||
"""
|
"""
|
||||||
YYT_factor = self.get_YYTfactor(Y)
|
|
||||||
|
if mean_function is None:
|
||||||
|
m = 0
|
||||||
|
else:
|
||||||
|
m = mean_function.f(X)
|
||||||
|
|
||||||
|
|
||||||
|
YYT_factor = self.get_YYTfactor(Y-m)
|
||||||
|
|
||||||
K = kern.K(X)
|
K = kern.K(X)
|
||||||
|
|
||||||
|
|
@ -56,4 +63,4 @@ class ExactGaussianInference(LatentFunctionInference):
|
||||||
|
|
||||||
dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK),Y_metadata)
|
dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK),Y_metadata)
|
||||||
|
|
||||||
return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL}
|
return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL, 'dL_dm':alpha}
|
||||||
|
|
|
||||||
|
|
@ -33,7 +33,8 @@ class EP(LatentFunctionInference):
|
||||||
# TODO: update approximation in the end as well? Maybe even with a switch?
|
# TODO: update approximation in the end as well? Maybe even with a switch?
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def inference(self, kern, X, likelihood, Y, Y_metadata=None, Z=None):
|
def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None, Z=None):
|
||||||
|
assert mean_function is None, "inference with a mean function not implemented"
|
||||||
num_data, output_dim = Y.shape
|
num_data, output_dim = Y.shape
|
||||||
assert output_dim ==1, "ep in 1D only (for now!)"
|
assert output_dim ==1, "ep in 1D only (for now!)"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -64,7 +64,8 @@ class EPDTC(LatentFunctionInference):
|
||||||
self.old_mutilde, self.old_vtilde = None, None
|
self.old_mutilde, self.old_vtilde = None, None
|
||||||
self._ep_approximation = None
|
self._ep_approximation = None
|
||||||
|
|
||||||
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
|
def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
|
||||||
|
assert mean_function is None, "inference with a mean function not implemented"
|
||||||
num_data, output_dim = Y.shape
|
num_data, output_dim = Y.shape
|
||||||
assert output_dim ==1, "ep in 1D only (for now!)"
|
assert output_dim ==1, "ep in 1D only (for now!)"
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -18,7 +18,8 @@ class FITC(LatentFunctionInference):
|
||||||
"""
|
"""
|
||||||
const_jitter = 1e-6
|
const_jitter = 1e-6
|
||||||
|
|
||||||
def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
|
def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
|
||||||
|
assert mean_function is None, "inference with a mean function not implemented"
|
||||||
|
|
||||||
num_inducing, _ = Z.shape
|
num_inducing, _ = Z.shape
|
||||||
num_data, output_dim = Y.shape
|
num_data, output_dim = Y.shape
|
||||||
|
|
|
||||||
|
|
@ -39,10 +39,12 @@ class Laplace(LatentFunctionInference):
|
||||||
self.first_run = True
|
self.first_run = True
|
||||||
self._previous_Ki_fhat = None
|
self._previous_Ki_fhat = None
|
||||||
|
|
||||||
def inference(self, kern, X, likelihood, Y, Y_metadata=None):
|
def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None):
|
||||||
"""
|
"""
|
||||||
Returns a Posterior class containing essential quantities of the posterior
|
Returns a Posterior class containing essential quantities of the posterior
|
||||||
"""
|
"""
|
||||||
|
assert mean_function is None, "inference with a mean function not implemented"
|
||||||
|
|
||||||
# Compute K
|
# Compute K
|
||||||
K = kern.K(X)
|
K = kern.K(X)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -15,7 +15,7 @@ class Posterior(object):
|
||||||
the function at any new point x_* by integrating over this posterior.
|
the function at any new point x_* by integrating over this posterior.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None):
|
def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None, prior_mean=0):
|
||||||
"""
|
"""
|
||||||
woodbury_chol : a lower triangular matrix L that satisfies posterior_covariance = K - K L^{-T} L^{-1} K
|
woodbury_chol : a lower triangular matrix L that satisfies posterior_covariance = K - K L^{-T} L^{-1} K
|
||||||
woodbury_vector : a matrix (or vector, as Nx1 matrix) M which satisfies posterior_mean = K M
|
woodbury_vector : a matrix (or vector, as Nx1 matrix) M which satisfies posterior_mean = K M
|
||||||
|
|
@ -67,6 +67,7 @@ class Posterior(object):
|
||||||
#option 2:
|
#option 2:
|
||||||
self._mean = mean
|
self._mean = mean
|
||||||
self._covariance = cov
|
self._covariance = cov
|
||||||
|
self._prior_mean = prior_mean
|
||||||
|
|
||||||
#compute this lazily
|
#compute this lazily
|
||||||
self._precision = None
|
self._precision = None
|
||||||
|
|
@ -175,7 +176,7 @@ class Posterior(object):
|
||||||
$$
|
$$
|
||||||
"""
|
"""
|
||||||
if self._woodbury_vector is None:
|
if self._woodbury_vector is None:
|
||||||
self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean)
|
self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean - self._prior_mean)
|
||||||
return self._woodbury_vector
|
return self._woodbury_vector
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,8 @@ from posterior import Posterior
|
||||||
|
|
||||||
class SVGP(LatentFunctionInference):
|
class SVGP(LatentFunctionInference):
|
||||||
|
|
||||||
def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, Y_metadata=None, KL_scale=1.0, batch_scale=1.0):
|
def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None, KL_scale=1.0, batch_scale=1.0):
|
||||||
|
|
||||||
num_inducing = Z.shape[0]
|
num_inducing = Z.shape[0]
|
||||||
num_data, num_outputs = Y.shape
|
num_data, num_outputs = Y.shape
|
||||||
|
|
||||||
|
|
@ -22,6 +23,15 @@ class SVGP(LatentFunctionInference):
|
||||||
#S = S + np.eye(S.shape[0])*1e-5*np.max(np.max(S))
|
#S = S + np.eye(S.shape[0])*1e-5*np.max(np.max(S))
|
||||||
#Si, Lnew, _,_ = linalg.pdinv(S)
|
#Si, Lnew, _,_ = linalg.pdinv(S)
|
||||||
|
|
||||||
|
#compute mean function stuff
|
||||||
|
if mean_function is not None:
|
||||||
|
prior_mean_u = mean_function.f(Z)
|
||||||
|
prior_mean_f = mean_function.f(X)
|
||||||
|
else:
|
||||||
|
prior_mean_u = np.zeros((num_inducing, num_outputs))
|
||||||
|
prior_mean_f = np.zeros((num_data, num_outputs))
|
||||||
|
|
||||||
|
|
||||||
#compute kernel related stuff
|
#compute kernel related stuff
|
||||||
Kmm = kern.K(Z)
|
Kmm = kern.K(Z)
|
||||||
Knm = kern.K(X, Z)
|
Knm = kern.K(X, Z)
|
||||||
|
|
@ -30,17 +40,31 @@ class SVGP(LatentFunctionInference):
|
||||||
|
|
||||||
#compute the marginal means and variances of q(f)
|
#compute the marginal means and variances of q(f)
|
||||||
A = np.dot(Knm, Kmmi)
|
A = np.dot(Knm, Kmmi)
|
||||||
mu = np.dot(A, q_u_mean)
|
mu = prior_mean_f + np.dot(A, q_u_mean - prior_mean_u)
|
||||||
v = Knn_diag[:,None] - np.sum(A*Knm,1)[:,None] + np.sum(A[:,:,None] * np.einsum('ij,jkl->ikl', A, S),1)
|
v = Knn_diag[:,None] - np.sum(A*Knm,1)[:,None] + np.sum(A[:,:,None] * np.einsum('ij,jkl->ikl', A, S),1)
|
||||||
|
|
||||||
#compute the KL term
|
#compute the KL term
|
||||||
Kmmim = np.dot(Kmmi, q_u_mean)
|
Kmmim = np.dot(Kmmi, q_u_mean)
|
||||||
KLs = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.einsum('ij,ijk->k', Kmmi, S) + 0.5*np.sum(q_u_mean*Kmmim,0)
|
KLs = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.einsum('ij,ijk->k', Kmmi, S) + 0.5*np.sum(q_u_mean*Kmmim,0)
|
||||||
KL = KLs.sum()
|
KL = KLs.sum()
|
||||||
dKL_dm = Kmmim
|
#gradient of the KL term (assuming zero mean function)
|
||||||
|
dKL_dm = Kmmim.copy()
|
||||||
dKL_dS = 0.5*(Kmmi[:,:,None] - Si)
|
dKL_dS = 0.5*(Kmmi[:,:,None] - Si)
|
||||||
dKL_dKmm = 0.5*num_outputs*Kmmi - 0.5*Kmmi.dot(S.sum(-1)).dot(Kmmi) - 0.5*Kmmim.dot(Kmmim.T)
|
dKL_dKmm = 0.5*num_outputs*Kmmi - 0.5*Kmmi.dot(S.sum(-1)).dot(Kmmi) - 0.5*Kmmim.dot(Kmmim.T)
|
||||||
|
|
||||||
|
if mean_function is not None:
|
||||||
|
#adjust KL term for mean function
|
||||||
|
Kmmi_mfZ = np.dot(Kmmi, prior_mean_u)
|
||||||
|
KL += -np.sum(q_u_mean*Kmmi_mfZ)
|
||||||
|
KL += 0.5*np.sum(Kmmi_mfZ*prior_mean_u)
|
||||||
|
|
||||||
|
#adjust gradient for mean fucntion
|
||||||
|
dKL_dm -= Kmmi_mfZ
|
||||||
|
dKL_dKmm += Kmmim.dot(Kmmi_mfZ.T)
|
||||||
|
dKL_dKmm -= 0.5*Kmmi_mfZ.dot(Kmmi_mfZ.T)
|
||||||
|
|
||||||
|
#compute gradients for mean_function
|
||||||
|
dKL_dmfZ = Kmmi_mfZ - Kmmim
|
||||||
|
|
||||||
#quadrature for the likelihood
|
#quadrature for the likelihood
|
||||||
F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v, Y_metadata=Y_metadata)
|
F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v, Y_metadata=Y_metadata)
|
||||||
|
|
@ -50,11 +74,9 @@ class SVGP(LatentFunctionInference):
|
||||||
if dF_dthetaL is not None:
|
if dF_dthetaL is not None:
|
||||||
dF_dthetaL = dF_dthetaL.sum(1).sum(1)*batch_scale
|
dF_dthetaL = dF_dthetaL.sum(1).sum(1)*batch_scale
|
||||||
|
|
||||||
#derivatives of expected likelihood
|
#derivatives of expected likelihood, assuming zero mean function
|
||||||
Adv = A.T[:,:,None]*dF_dv[None,:,:] # As if dF_Dv is diagonal
|
Adv = A.T[:,:,None]*dF_dv[None,:,:] # As if dF_Dv is diagonal
|
||||||
Admu = A.T.dot(dF_dmu)
|
Admu = A.T.dot(dF_dmu)
|
||||||
#AdvA = np.einsum('ijk,jl->ilk', Adv, A)
|
|
||||||
#AdvA = np.dot(A.T, Adv).swapaxes(0,1)
|
|
||||||
AdvA = np.dstack([np.dot(A.T, Adv[:,:,i].T) for i in range(num_outputs)])
|
AdvA = np.dstack([np.dot(A.T, Adv[:,:,i].T) for i in range(num_outputs)])
|
||||||
tmp = np.einsum('ijk,jlk->il', AdvA, S).dot(Kmmi)
|
tmp = np.einsum('ijk,jlk->il', AdvA, S).dot(Kmmi)
|
||||||
dF_dKmm = -Admu.dot(Kmmim.T) + AdvA.sum(-1) - tmp - tmp.T
|
dF_dKmm = -Admu.dot(Kmmim.T) + AdvA.sum(-1) - tmp - tmp.T
|
||||||
|
|
@ -64,6 +86,14 @@ class SVGP(LatentFunctionInference):
|
||||||
dF_dm = Admu
|
dF_dm = Admu
|
||||||
dF_dS = AdvA
|
dF_dS = AdvA
|
||||||
|
|
||||||
|
#adjust gradient to account for mean function
|
||||||
|
if mean_function is not None:
|
||||||
|
dF_dmfX = dF_dmu.copy()
|
||||||
|
dF_dmfZ = -Admu
|
||||||
|
dF_dKmn -= np.dot(Kmmi_mfZ, dF_dmu.T)
|
||||||
|
dF_dKmm += Admu.dot(Kmmi_mfZ.T)
|
||||||
|
|
||||||
|
|
||||||
#sum (gradients of) expected likelihood and KL part
|
#sum (gradients of) expected likelihood and KL part
|
||||||
log_marginal = F.sum() - KL
|
log_marginal = F.sum() - KL
|
||||||
dL_dm, dL_dS, dL_dKmm, dL_dKmn = dF_dm - dKL_dm, dF_dS- dKL_dS, dF_dKmm- dKL_dKmm, dF_dKmn
|
dL_dm, dL_dS, dL_dKmm, dL_dKmn = dF_dm - dKL_dm, dF_dS- dKL_dS, dF_dKmm- dKL_dKmm, dF_dKmn
|
||||||
|
|
@ -71,4 +101,8 @@ class SVGP(LatentFunctionInference):
|
||||||
dL_dchol = np.dstack([2.*np.dot(dL_dS[:,:,i], L[:,:,i]) for i in range(num_outputs)])
|
dL_dchol = np.dstack([2.*np.dot(dL_dS[:,:,i], L[:,:,i]) for i in range(num_outputs)])
|
||||||
dL_dchol = choleskies.triang_to_flat(dL_dchol)
|
dL_dchol = choleskies.triang_to_flat(dL_dchol)
|
||||||
|
|
||||||
return Posterior(mean=q_u_mean, cov=S, K=Kmm), log_marginal, {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv.sum(1), 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}
|
grad_dict = {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv.sum(1), 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}
|
||||||
|
if mean_function is not None:
|
||||||
|
grad_dict['dL_dmfZ'] = dF_dmfZ - dKL_dmfZ
|
||||||
|
grad_dict['dL_dmfX'] = dF_dmfX
|
||||||
|
return Posterior(mean=q_u_mean, cov=S, K=Kmm, prior_mean=prior_mean_u), log_marginal, grad_dict
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,20 @@ from kern import CombinationKernel
|
||||||
from ...util.caching import Cache_this
|
from ...util.caching import Cache_this
|
||||||
import itertools
|
import itertools
|
||||||
|
|
||||||
|
|
||||||
|
def numpy_invalid_op_as_exception(func):
|
||||||
|
"""
|
||||||
|
A decorator that allows catching numpy invalid operations
|
||||||
|
as exceptions (the default behaviour is raising warnings).
|
||||||
|
"""
|
||||||
|
def func_wrapper(*args, **kwargs):
|
||||||
|
np.seterr(invalid='raise')
|
||||||
|
result = func(*args, **kwargs)
|
||||||
|
np.seterr(invalid='warn')
|
||||||
|
return result
|
||||||
|
return func_wrapper
|
||||||
|
|
||||||
|
|
||||||
class Prod(CombinationKernel):
|
class Prod(CombinationKernel):
|
||||||
"""
|
"""
|
||||||
Computes the product of 2 kernels
|
Computes the product of 2 kernels
|
||||||
|
|
@ -46,18 +60,20 @@ class Prod(CombinationKernel):
|
||||||
self.parts[0].update_gradients_full(dL_dK*self.parts[1].K(X,X2), X, X2)
|
self.parts[0].update_gradients_full(dL_dK*self.parts[1].K(X,X2), X, X2)
|
||||||
self.parts[1].update_gradients_full(dL_dK*self.parts[0].K(X,X2), X, X2)
|
self.parts[1].update_gradients_full(dL_dK*self.parts[0].K(X,X2), X, X2)
|
||||||
else:
|
else:
|
||||||
k = self.K(X,X2)*dL_dK
|
for combination in itertools.combinations(self.parts, len(self.parts) - 1):
|
||||||
for p in self.parts:
|
prod = reduce(np.multiply, [p.K(X, X2) for p in combination])
|
||||||
p.update_gradients_full(k/p.K(X,X2),X,X2)
|
to_update = list(set(self.parts) - set(combination))[0]
|
||||||
|
to_update.update_gradients_full(dL_dK * prod, X, X2)
|
||||||
|
|
||||||
def update_gradients_diag(self, dL_dKdiag, X):
|
def update_gradients_diag(self, dL_dKdiag, X):
|
||||||
if len(self.parts)==2:
|
if len(self.parts)==2:
|
||||||
self.parts[0].update_gradients_diag(dL_dKdiag*self.parts[1].Kdiag(X), X)
|
self.parts[0].update_gradients_diag(dL_dKdiag*self.parts[1].Kdiag(X), X)
|
||||||
self.parts[1].update_gradients_diag(dL_dKdiag*self.parts[0].Kdiag(X), X)
|
self.parts[1].update_gradients_diag(dL_dKdiag*self.parts[0].Kdiag(X), X)
|
||||||
else:
|
else:
|
||||||
k = self.Kdiag(X)*dL_dKdiag
|
for combination in itertools.combinations(self.parts, len(self.parts) - 1):
|
||||||
for p in self.parts:
|
prod = reduce(np.multiply, [p.Kdiag(X) for p in combination])
|
||||||
p.update_gradients_diag(k/p.Kdiag(X),X)
|
to_update = list(set(self.parts) - set(combination))[0]
|
||||||
|
to_update.update_gradients_diag(dL_dKdiag * prod, X)
|
||||||
|
|
||||||
def gradients_X(self, dL_dK, X, X2=None):
|
def gradients_X(self, dL_dK, X, X2=None):
|
||||||
target = np.zeros(X.shape)
|
target = np.zeros(X.shape)
|
||||||
|
|
@ -65,9 +81,10 @@ class Prod(CombinationKernel):
|
||||||
target += self.parts[0].gradients_X(dL_dK*self.parts[1].K(X, X2), X, X2)
|
target += self.parts[0].gradients_X(dL_dK*self.parts[1].K(X, X2), X, X2)
|
||||||
target += self.parts[1].gradients_X(dL_dK*self.parts[0].K(X, X2), X, X2)
|
target += self.parts[1].gradients_X(dL_dK*self.parts[0].K(X, X2), X, X2)
|
||||||
else:
|
else:
|
||||||
k = self.K(X,X2)*dL_dK
|
for combination in itertools.combinations(self.parts, len(self.parts) - 1):
|
||||||
for p in self.parts:
|
prod = reduce(np.multiply, [p.K(X, X2) for p in combination])
|
||||||
target += p.gradients_X(k/p.K(X,X2),X,X2)
|
to_update = list(set(self.parts) - set(combination))[0]
|
||||||
|
target += to_update.gradients_X(dL_dK * prod, X, X2)
|
||||||
return target
|
return target
|
||||||
|
|
||||||
def gradients_X_diag(self, dL_dKdiag, X):
|
def gradients_X_diag(self, dL_dKdiag, X):
|
||||||
|
|
@ -80,3 +97,5 @@ class Prod(CombinationKernel):
|
||||||
for p in self.parts:
|
for p in self.parts:
|
||||||
target += p.gradients_X_diag(k/p.Kdiag(X),X)
|
target += p.gradients_X_diag(k/p.Kdiag(X),X)
|
||||||
return target
|
return target
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -4,4 +4,5 @@
|
||||||
from kernel import Kernel
|
from kernel import Kernel
|
||||||
from linear import Linear
|
from linear import Linear
|
||||||
from mlp import MLP
|
from mlp import MLP
|
||||||
#from rbf import RBF
|
from additive import Additive
|
||||||
|
from compound import Compound
|
||||||
|
|
|
||||||
|
|
@ -2,8 +2,7 @@
|
||||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from ..core.mapping import Mapping
|
from ..core import Mapping
|
||||||
import GPy
|
|
||||||
|
|
||||||
class Additive(Mapping):
|
class Additive(Mapping):
|
||||||
"""
|
"""
|
||||||
|
|
@ -27,8 +26,6 @@ class Additive(Mapping):
|
||||||
Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
|
Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
|
||||||
self.mapping1 = mapping1
|
self.mapping1 = mapping1
|
||||||
self.mapping2 = mapping2
|
self.mapping2 = mapping2
|
||||||
self.num_params = self.mapping1.num_params + self.mapping2.num_params
|
|
||||||
self.name = self.mapping1.name + '+' + self.mapping2.name
|
|
||||||
|
|
||||||
def f(self, X):
|
def f(self, X):
|
||||||
return self.mapping1.f(X) + self.mapping2.f(X)
|
return self.mapping1.f(X) + self.mapping2.f(X)
|
||||||
|
|
|
||||||
39
GPy/mappings/compound.py
Normal file
39
GPy/mappings/compound.py
Normal file
|
|
@ -0,0 +1,39 @@
|
||||||
|
# Copyright (c) 2015, James Hensman and Alan Saul
|
||||||
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
|
from ..core import Mapping
|
||||||
|
|
||||||
|
class Compound(Mapping):
|
||||||
|
"""
|
||||||
|
Mapping based on passing one mapping through another
|
||||||
|
|
||||||
|
.. math::
|
||||||
|
|
||||||
|
f(\mathbf{x}) = f_2(f_1(\mathbf{x}))
|
||||||
|
|
||||||
|
:param mapping1: first mapping
|
||||||
|
:type mapping1: GPy.mappings.Mapping
|
||||||
|
:param mapping2: second mapping
|
||||||
|
:type mapping2: GPy.mappings.Mapping
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, mapping1, mapping2):
|
||||||
|
assert(mapping1.output_dim==mapping2.input_dim)
|
||||||
|
input_dim, output_dim = mapping1.input_dim, mapping2.output_dim
|
||||||
|
Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
|
||||||
|
self.mapping1 = mapping1
|
||||||
|
self.mapping2 = mapping2
|
||||||
|
self.link_parameters(self.mapping1, self.mapping2)
|
||||||
|
|
||||||
|
def f(self, X):
|
||||||
|
return self.mapping2.f(self.mapping1.f(X))
|
||||||
|
|
||||||
|
def update_gradients(self, dL_dF, X):
|
||||||
|
hidden = self.mapping1.f(X)
|
||||||
|
self.mapping2.update_gradients(dL_dF, hidden)
|
||||||
|
self.mapping1.update_gradients(self.mapping2.gradients_X(dL_dF, hidden), X)
|
||||||
|
|
||||||
|
def gradients_X(self, dL_dF, X):
|
||||||
|
hidden = self.mapping1.f(X)
|
||||||
|
return self.mapping1.gradients_X(self.mapping2.gradients_X(dL_dF, hidden), X)
|
||||||
|
|
@ -36,16 +36,16 @@ class Kernel(Mapping):
|
||||||
Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
|
Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
|
||||||
self.kern = kernel
|
self.kern = kernel
|
||||||
self.Z = Z
|
self.Z = Z
|
||||||
self.num_bases, Zdim = X.shape
|
self.num_bases, Zdim = Z.shape
|
||||||
assert Zdim == self.input_dim
|
assert Zdim == self.input_dim
|
||||||
self.A = GPy.core.Param('A', np.random.randn(self.num_bases, self.output_dim))
|
self.A = Param('A', np.random.randn(self.num_bases, self.output_dim))
|
||||||
self.add_parameter(self.A)
|
self.link_parameter(self.A)
|
||||||
|
|
||||||
def f(self, X):
|
def f(self, X):
|
||||||
return np.dot(self.kern.K(X, self.Z), self.A)
|
return np.dot(self.kern.K(X, self.Z), self.A)
|
||||||
|
|
||||||
def update_gradients(self, dL_dF, X):
|
def update_gradients(self, dL_dF, X):
|
||||||
self.kern.update_gradients_full(np.dot(dL_dF, self.A.T))
|
self.kern.update_gradients_full(np.dot(dL_dF, self.A.T), X, self.Z)
|
||||||
self.A.gradient = np.dot( self.kern.K(self.Z, X), dL_dF)
|
self.A.gradient = np.dot( self.kern.K(self.Z, X), dL_dF)
|
||||||
|
|
||||||
def gradients_X(self, dL_dF, X):
|
def gradients_X(self, dL_dF, X):
|
||||||
|
|
|
||||||
|
|
@ -26,8 +26,8 @@ class Linear(Mapping):
|
||||||
|
|
||||||
def __init__(self, input_dim, output_dim, name='linmap'):
|
def __init__(self, input_dim, output_dim, name='linmap'):
|
||||||
Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
|
Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
|
||||||
self.A = GPy.core.Param('A', np.random.randn(self.input_dim, self.output_dim))
|
self.A = Param('A', np.random.randn(self.input_dim, self.output_dim))
|
||||||
self.add_parameter(self.A)
|
self.link_parameter(self.A)
|
||||||
|
|
||||||
def f(self, X):
|
def f(self, X):
|
||||||
return np.dot(X, self.A)
|
return np.dot(X, self.A)
|
||||||
|
|
|
||||||
|
|
@ -11,32 +11,45 @@ class MLP(Mapping):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, input_dim=1, output_dim=1, hidden_dim=3, name='mlpmap'):
|
def __init__(self, input_dim=1, output_dim=1, hidden_dim=3, name='mlpmap'):
|
||||||
super(MLP).__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
|
super(MLP, self).__init__(input_dim=input_dim, output_dim=output_dim, name=name)
|
||||||
self.hidden_dim = hidden_dim
|
self.hidden_dim = hidden_dim
|
||||||
self.W1 = Param('W1', np.random.randn(self.input_dim, self.hidden_dim))
|
self.W1 = Param('W1', np.random.randn(self.input_dim, self.hidden_dim))
|
||||||
self.b1 = Param('b1', np.random.randn(self.hidden_dim))
|
self.b1 = Param('b1', np.random.randn(self.hidden_dim))
|
||||||
self.W2 = Param('W2', np.random.randn(self.hidden_dim, self.output_dim))
|
self.W2 = Param('W2', np.random.randn(self.hidden_dim, self.output_dim))
|
||||||
self.b2 = Param('b2', np.random.randn(self.output_dim))
|
self.b2 = Param('b2', np.random.randn(self.output_dim))
|
||||||
|
self.link_parameters(self.W1, self.b1, self.W2, self.b2)
|
||||||
|
|
||||||
|
|
||||||
def f(self, X):
|
def f(self, X):
|
||||||
N, D = X.shape
|
layer1 = np.dot(X, self.W1) + self.b1
|
||||||
activations = np.tanh(np.dot(X,self.W1) + self.b1)
|
activations = np.tanh(layer1)
|
||||||
self.out = np.dot(self.activations,self.W2) + self.b2
|
return np.dot(activations, self.W2) + self.b2
|
||||||
return self.output_fn(self.out)
|
|
||||||
|
|
||||||
def update_gradients(self, dL_dF, X):
|
def update_gradients(self, dL_dF, X):
|
||||||
activations = np.tanh(np.dot(X,self.W1) + self.b1)
|
layer1 = np.dot(X,self.W1) + self.b1
|
||||||
|
activations = np.tanh(layer1)
|
||||||
|
|
||||||
#Evaluate second-layer gradients.
|
#Evaluate second-layer gradients.
|
||||||
self.W2.gradient = np.dot(activations.T, dL_dF)
|
self.W2.gradient = np.dot(activations.T, dL_dF)
|
||||||
self.b2.gradient = np.sum(dL_dF, 0)
|
self.b2.gradient = np.sum(dL_dF, 0)
|
||||||
|
|
||||||
# Backpropagation to hidden layer.
|
# Backpropagation to hidden layer.
|
||||||
delta_hid = np.dot(dL_dF, self.W2.T) * (1.0 - activations**2)
|
dL_dact = np.dot(dL_dF, self.W2.T)
|
||||||
|
dL_dlayer1 = dL_dact / np.square(np.cosh(layer1))
|
||||||
|
|
||||||
# Finally, evaluate the first-layer gradients.
|
# Finally, evaluate the first-layer gradients.
|
||||||
self.W1.gradients = np.dot(X.T,delta_hid)
|
self.W1.gradient = np.dot(X.T,dL_dlayer1)
|
||||||
self.b1.gradients = np.sum(delta_hid, 0)
|
self.b1.gradient = np.sum(dL_dlayer1, 0)
|
||||||
|
|
||||||
|
def gradients_X(self, dL_dF, X):
|
||||||
|
layer1 = np.dot(X,self.W1) + self.b1
|
||||||
|
activations = np.tanh(layer1)
|
||||||
|
|
||||||
|
# Backpropagation to hidden layer.
|
||||||
|
dL_dact = np.dot(dL_dF, self.W2.T)
|
||||||
|
dL_dlayer1 = dL_dact / np.square(np.cosh(layer1))
|
||||||
|
|
||||||
|
return np.dot(dL_dlayer1, self.W1.T)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -43,10 +43,11 @@ class SparseGPMiniBatch(SparseGP):
|
||||||
def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
|
def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
|
||||||
name='sparse gp', Y_metadata=None, normalizer=False,
|
name='sparse gp', Y_metadata=None, normalizer=False,
|
||||||
missing_data=False, stochastic=False, batchsize=1):
|
missing_data=False, stochastic=False, batchsize=1):
|
||||||
#pick a sensible inference method
|
|
||||||
|
# pick a sensible inference method
|
||||||
if inference_method is None:
|
if inference_method is None:
|
||||||
if isinstance(likelihood, likelihoods.Gaussian):
|
if isinstance(likelihood, likelihoods.Gaussian):
|
||||||
inference_method = var_dtc.VarDTC(limit=1 if not self.missing_data else Y.shape[1])
|
inference_method = var_dtc.VarDTC(limit=1 if not missing_data else Y.shape[1])
|
||||||
else:
|
else:
|
||||||
#inference_method = ??
|
#inference_method = ??
|
||||||
raise NotImplementedError, "what to do what to do?"
|
raise NotImplementedError, "what to do what to do?"
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,6 @@
|
||||||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from ..util.warping_functions import *
|
from ..util.warping_functions import *
|
||||||
from ..core import GP
|
from ..core import GP
|
||||||
|
|
@ -10,14 +9,16 @@ from GPy.util.warping_functions import TanhWarpingFunction_d
|
||||||
from GPy import kern
|
from GPy import kern
|
||||||
|
|
||||||
class WarpedGP(GP):
|
class WarpedGP(GP):
|
||||||
def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3, normalize_X=False, normalize_Y=False):
|
def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3):
|
||||||
|
|
||||||
if kernel is None:
|
if kernel is None:
|
||||||
kernel = kern.rbf(X.shape[1])
|
kernel = kern.RBF(X.shape[1])
|
||||||
|
|
||||||
if warping_function == None:
|
if warping_function == None:
|
||||||
self.warping_function = TanhWarpingFunction_d(warping_terms)
|
self.warping_function = TanhWarpingFunction_d(warping_terms)
|
||||||
self.warping_params = (np.random.randn(self.warping_function.n_terms * 3 + 1,) * 1)
|
self.warping_params = (np.random.randn(self.warping_function.n_terms * 3 + 1,) * 1)
|
||||||
|
else:
|
||||||
|
self.warping_function = warping_function
|
||||||
|
|
||||||
self.scale_data = False
|
self.scale_data = False
|
||||||
if self.scale_data:
|
if self.scale_data:
|
||||||
|
|
@ -25,10 +26,10 @@ class WarpedGP(GP):
|
||||||
self.has_uncertain_inputs = False
|
self.has_uncertain_inputs = False
|
||||||
self.Y_untransformed = Y.copy()
|
self.Y_untransformed = Y.copy()
|
||||||
self.predict_in_warped_space = False
|
self.predict_in_warped_space = False
|
||||||
likelihood = likelihoods.Gaussian(self.transform_data(), normalize=normalize_Y)
|
likelihood = likelihoods.Gaussian()
|
||||||
|
|
||||||
GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
|
GP.__init__(self, X, self.transform_data(), likelihood=likelihood, kernel=kernel)
|
||||||
self._set_params(self._get_params())
|
self.link_parameter(self.warping_function)
|
||||||
|
|
||||||
def _scale_data(self, Y):
|
def _scale_data(self, Y):
|
||||||
self._Ymax = Y.max()
|
self._Ymax = Y.max()
|
||||||
|
|
@ -38,62 +39,55 @@ class WarpedGP(GP):
|
||||||
def _unscale_data(self, Y):
|
def _unscale_data(self, Y):
|
||||||
return (Y + 0.5) * (self._Ymax - self._Ymin) + self._Ymin
|
return (Y + 0.5) * (self._Ymax - self._Ymin) + self._Ymin
|
||||||
|
|
||||||
def _set_params(self, x):
|
def parameters_changed(self):
|
||||||
self.warping_params = x[:self.warping_function.num_parameters]
|
self.Y[:] = self.transform_data()
|
||||||
Y = self.transform_data()
|
super(WarpedGP, self).parameters_changed()
|
||||||
self.likelihood.set_data(Y)
|
|
||||||
GP._set_params(self, x[self.warping_function.num_parameters:].copy())
|
|
||||||
|
|
||||||
def _get_params(self):
|
Kiy = self.posterior.woodbury_vector.flatten()
|
||||||
return np.hstack((self.warping_params.flatten().copy(), GP._get_params(self).copy()))
|
|
||||||
|
|
||||||
def _get_param_names(self):
|
grad_y = self.warping_function.fgrad_y(self.Y_untransformed)
|
||||||
warping_names = self.warping_function._get_param_names()
|
grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed,
|
||||||
param_names = GP._get_param_names(self)
|
|
||||||
return warping_names + param_names
|
|
||||||
|
|
||||||
def transform_data(self):
|
|
||||||
Y = self.warping_function.f(self.Y_untransformed.copy(), self.warping_params).copy()
|
|
||||||
return Y
|
|
||||||
|
|
||||||
def log_likelihood(self):
|
|
||||||
ll = GP.log_likelihood(self)
|
|
||||||
jacobian = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params)
|
|
||||||
return ll + np.log(jacobian).sum()
|
|
||||||
|
|
||||||
def _log_likelihood_gradients(self):
|
|
||||||
ll_grads = GP._log_likelihood_gradients(self)
|
|
||||||
alpha = np.dot(self.Ki, self.likelihood.Y.flatten())
|
|
||||||
warping_grads = self.warping_function_gradients(alpha)
|
|
||||||
|
|
||||||
warping_grads = np.append(warping_grads[:, :-1].flatten(), warping_grads[0, -1])
|
|
||||||
return np.hstack((warping_grads.flatten(), ll_grads.flatten()))
|
|
||||||
|
|
||||||
def warping_function_gradients(self, Kiy):
|
|
||||||
grad_y = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params)
|
|
||||||
grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed, self.warping_params,
|
|
||||||
return_covar_chain=True)
|
return_covar_chain=True)
|
||||||
djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
|
djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
|
||||||
dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
|
dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
|
||||||
|
|
||||||
return -dquad_dpsi + djac_dpsi
|
warping_grads = -dquad_dpsi + djac_dpsi
|
||||||
|
|
||||||
|
self.warping_function.psi.gradient[:] = warping_grads[:, :-1]
|
||||||
|
self.warping_function.d.gradient[:] = warping_grads[0, -1]
|
||||||
|
|
||||||
|
|
||||||
|
def transform_data(self):
|
||||||
|
Y = self.warping_function.f(self.Y_untransformed.copy()).copy()
|
||||||
|
return Y
|
||||||
|
|
||||||
|
def log_likelihood(self):
|
||||||
|
ll = GP.log_likelihood(self)
|
||||||
|
jacobian = self.warping_function.fgrad_y(self.Y_untransformed)
|
||||||
|
return ll + np.log(jacobian).sum()
|
||||||
|
|
||||||
def plot_warping(self):
|
def plot_warping(self):
|
||||||
self.warping_function.plot(self.warping_params, self.Y_untransformed.min(), self.Y_untransformed.max())
|
self.warping_function.plot(self.Y_untransformed.min(), self.Y_untransformed.max())
|
||||||
|
|
||||||
def predict(self, Xnew, which_parts='all', full_cov=False, pred_init=None):
|
def predict(self, Xnew, which_parts='all', pred_init=None):
|
||||||
# normalize X values
|
# normalize X values
|
||||||
Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
|
# Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
|
||||||
mu, var = GP._raw_predict(self, Xnew, full_cov=full_cov, which_parts=which_parts)
|
mu, var = GP._raw_predict(self, Xnew)
|
||||||
|
|
||||||
# now push through likelihood
|
# now push through likelihood
|
||||||
mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
|
mean, var = self.likelihood.predictive_values(mu, var)
|
||||||
|
|
||||||
if self.predict_in_warped_space:
|
if self.predict_in_warped_space:
|
||||||
mean = self.warping_function.f_inv(mean, self.warping_params, y=pred_init)
|
mean = self.warping_function.f_inv(mean, y=pred_init)
|
||||||
var = self.warping_function.f_inv(var, self.warping_params)
|
var = self.warping_function.f_inv(var)
|
||||||
|
|
||||||
if self.scale_data:
|
if self.scale_data:
|
||||||
mean = self._unscale_data(mean)
|
mean = self._unscale_data(mean)
|
||||||
|
|
||||||
return mean, var, _025pm, _975pm
|
return mean, var
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
X = np.random.randn(100, 1)
|
||||||
|
Y = np.sin(X) + np.random.randn(100, 1)*0.05
|
||||||
|
|
||||||
|
m = WarpedGP(X, Y)
|
||||||
|
|
|
||||||
|
|
@ -6,7 +6,11 @@ try:
|
||||||
from matplotlib.patches import Polygon
|
from matplotlib.patches import Polygon
|
||||||
from matplotlib.collections import PatchCollection
|
from matplotlib.collections import PatchCollection
|
||||||
#from matplotlib import cm
|
#from matplotlib import cm
|
||||||
pb.ion()
|
try:
|
||||||
|
__IPYTHON__
|
||||||
|
pb.ion()
|
||||||
|
except NameError:
|
||||||
|
pass
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
import re
|
import re
|
||||||
|
|
|
||||||
|
|
@ -256,13 +256,23 @@ class KernelGradientTestsContinuous(unittest.TestCase):
|
||||||
k.randomize()
|
k.randomize()
|
||||||
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
|
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
|
||||||
|
|
||||||
|
def test_Prod1(self):
|
||||||
|
k = GPy.kern.RBF(self.D) * GPy.kern.Linear(self.D)
|
||||||
|
k.randomize()
|
||||||
|
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
|
||||||
|
|
||||||
def test_Prod2(self):
|
def test_Prod2(self):
|
||||||
k = (GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D))
|
k = GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D)
|
||||||
k.randomize()
|
k.randomize()
|
||||||
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
|
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
|
||||||
|
|
||||||
def test_Prod3(self):
|
def test_Prod3(self):
|
||||||
k = (GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D))
|
k = GPy.kern.RBF(self.D) * GPy.kern.Linear(self.D) * GPy.kern.Bias(self.D)
|
||||||
|
k.randomize()
|
||||||
|
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
|
||||||
|
|
||||||
|
def test_Prod4(self):
|
||||||
|
k = GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D) * GPy.kern.Matern32(2, active_dims=[0,1])
|
||||||
k.randomize()
|
k.randomize()
|
||||||
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
|
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
|
||||||
|
|
||||||
|
|
@ -401,11 +411,27 @@ class Coregionalize_weave_test(unittest.TestCase):
|
||||||
GPy.util.config.config.set('weave', 'working', 'False')
|
GPy.util.config.config.set('weave', 'working', 'False')
|
||||||
|
|
||||||
|
|
||||||
|
class KernelTestsProductWithZeroValues(unittest.TestCase):
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
self.X = np.array([[0,1],[1,0]])
|
||||||
|
self.k = GPy.kern.Linear(2) * GPy.kern.Bias(2)
|
||||||
|
|
||||||
|
def test_zero_valued_kernel_full(self):
|
||||||
|
self.k.update_gradients_full(1, self.X)
|
||||||
|
self.assertFalse(np.isnan(self.k['linear.variances'].gradient),
|
||||||
|
"Gradient resulted in NaN")
|
||||||
|
|
||||||
|
def test_zero_valued_kernel_gradients_X(self):
|
||||||
|
target = self.k.gradients_X(1, self.X)
|
||||||
|
self.assertFalse(np.any(np.isnan(target)),
|
||||||
|
"Gradient resulted in NaN")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
print "Running unit tests, please be (very) patient..."
|
print "Running unit tests, please be (very) patient..."
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|
||||||
# np.random.seed(0)
|
# np.random.seed(0)
|
||||||
# N0 = 3
|
# N0 = 3
|
||||||
# N1 = 9
|
# N1 = 9
|
||||||
|
|
|
||||||
72
GPy/testing/mapping_tests.py
Normal file
72
GPy/testing/mapping_tests.py
Normal file
|
|
@ -0,0 +1,72 @@
|
||||||
|
# Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt).
|
||||||
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
|
import unittest
|
||||||
|
import numpy as np
|
||||||
|
import GPy
|
||||||
|
|
||||||
|
class MappingGradChecker(GPy.core.Model):
|
||||||
|
"""
|
||||||
|
This class has everything we need to check the gradient of a mapping. It
|
||||||
|
implement a simple likelihood which is a weighted sum of the outputs of the
|
||||||
|
mapping. the gradients are checked against the parameters of the mapping
|
||||||
|
and the input.
|
||||||
|
"""
|
||||||
|
def __init__(self, mapping, X, name='map_grad_check'):
|
||||||
|
super(MappingGradChecker, self).__init__(name)
|
||||||
|
self.mapping = mapping
|
||||||
|
self.link_parameter(self.mapping)
|
||||||
|
self.X = GPy.core.Param('X',X)
|
||||||
|
self.link_parameter(self.X)
|
||||||
|
self.dL_dY = np.random.randn(self.X.shape[0], self.mapping.output_dim)
|
||||||
|
def log_likelihood(self):
|
||||||
|
return np.sum(self.mapping.f(self.X) * self.dL_dY)
|
||||||
|
def parameters_changed(self):
|
||||||
|
self.X.gradient = self.mapping.gradients_X(self.dL_dY, self.X)
|
||||||
|
self.mapping.update_gradients(self.dL_dY, self.X)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class MappingTests(unittest.TestCase):
|
||||||
|
|
||||||
|
def test_kernelmapping(self):
|
||||||
|
X = np.random.randn(100,3)
|
||||||
|
Z = np.random.randn(10,3)
|
||||||
|
mapping = GPy.mappings.Kernel(3, 2, Z, GPy.kern.RBF(3))
|
||||||
|
self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
|
||||||
|
|
||||||
|
def test_linearmapping(self):
|
||||||
|
mapping = GPy.mappings.Linear(3, 2)
|
||||||
|
X = np.random.randn(100,3)
|
||||||
|
self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
|
||||||
|
|
||||||
|
def test_mlpmapping(self):
|
||||||
|
mapping = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
|
||||||
|
X = np.random.randn(100,3)
|
||||||
|
self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
|
||||||
|
|
||||||
|
def test_addmapping(self):
|
||||||
|
m1 = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
|
||||||
|
m2 = GPy.mappings.Linear(input_dim=3, output_dim=2)
|
||||||
|
mapping = GPy.mappings.Additive(m1, m2)
|
||||||
|
X = np.random.randn(100,3)
|
||||||
|
self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
|
||||||
|
|
||||||
|
def test_compoundmapping(self):
|
||||||
|
m1 = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
|
||||||
|
Z = np.random.randn(10,2)
|
||||||
|
m2 = GPy.mappings.Kernel(2, 4, Z, GPy.kern.RBF(2))
|
||||||
|
mapping = GPy.mappings.Compound(m1, m2)
|
||||||
|
X = np.random.randn(100,3)
|
||||||
|
self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
print "Running unit tests, please be (very) patient..."
|
||||||
|
unittest.main()
|
||||||
|
|
@ -32,3 +32,23 @@ class SVGP_classification(np.testing.TestCase):
|
||||||
self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k)
|
self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k)
|
||||||
def test_grad(self):
|
def test_grad(self):
|
||||||
assert self.m.checkgrad(step=1e-4)
|
assert self.m.checkgrad(step=1e-4)
|
||||||
|
|
||||||
|
class SVGP_Poisson_with_meanfunction(np.testing.TestCase):
|
||||||
|
"""
|
||||||
|
Inference in the SVGP with a Bernoulli likelihood
|
||||||
|
"""
|
||||||
|
def setUp(self):
|
||||||
|
X = np.linspace(0,10,100).reshape(-1,1)
|
||||||
|
Z = np.linspace(0,10,10).reshape(-1,1)
|
||||||
|
latent_f = np.exp(0.1*X * 0.05*X**2)
|
||||||
|
Y = np.array([np.random.poisson(f) for f in latent_f.flatten()]).reshape(-1,1)
|
||||||
|
|
||||||
|
mf = GPy.mappings.Linear(1,1)
|
||||||
|
|
||||||
|
lik = GPy.likelihoods.Poisson()
|
||||||
|
k = GPy.kern.RBF(1, lengthscale=5.) + GPy.kern.White(1, 1e-6)
|
||||||
|
self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k, mean_function=mf)
|
||||||
|
def test_grad(self):
|
||||||
|
assert self.m.checkgrad(step=1e-4)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -96,16 +96,21 @@ def jitchol(A, maxtries=5):
|
||||||
num_tries = 1
|
num_tries = 1
|
||||||
while num_tries <= maxtries and np.isfinite(jitter):
|
while num_tries <= maxtries and np.isfinite(jitter):
|
||||||
try:
|
try:
|
||||||
|
print jitter
|
||||||
L = linalg.cholesky(A + np.eye(A.shape[0]) * jitter, lower=True)
|
L = linalg.cholesky(A + np.eye(A.shape[0]) * jitter, lower=True)
|
||||||
logging.warning('Added {} rounds of jitter, jitter of {:.10e}\n'.format(num_tries, jitter))
|
|
||||||
return L
|
return L
|
||||||
except:
|
except:
|
||||||
jitter *= 10
|
jitter *= 10
|
||||||
|
finally:
|
||||||
num_tries += 1
|
num_tries += 1
|
||||||
|
raise linalg.LinAlgError, "not positive definite, even with jitter."
|
||||||
import traceback
|
import traceback
|
||||||
logging.warning('\n'.join(['Added {} rounds of jitter, jitter of {:.10e}'.format(num_tries-1, jitter),
|
try: raise
|
||||||
' in '+traceback.format_list(traceback.extract_stack(limit=2)[-2:-1])[0][2:]]))
|
except:
|
||||||
raise linalg.LinAlgError, "not positive definite, even with jitter."
|
logging.warning('\n'.join(['Added jitter of {:.10e}'.format(jitter),
|
||||||
|
' in '+traceback.format_list(traceback.extract_stack(limit=2)[-2:-1])[0][2:]]))
|
||||||
|
import ipdb;ipdb.set_trace()
|
||||||
|
return L
|
||||||
|
|
||||||
# def dtrtri(L, lower=1):
|
# def dtrtri(L, lower=1):
|
||||||
# """
|
# """
|
||||||
|
|
|
||||||
|
|
@ -1,17 +1,18 @@
|
||||||
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
|
||||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||||
|
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
from GPy.core.parameterization import Parameterized, Param
|
||||||
|
from ..core.parameterization.transformations import Logexp
|
||||||
|
|
||||||
class WarpingFunction(object):
|
class WarpingFunction(Parameterized):
|
||||||
"""
|
"""
|
||||||
abstract function for warping
|
abstract function for warping
|
||||||
z = f(y)
|
z = f(y)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self, name):
|
||||||
raise NotImplementedError
|
super(WarpingFunction, self).__init__(name=name)
|
||||||
|
|
||||||
def f(self,y,psi):
|
def f(self,y,psi):
|
||||||
"""function transformation
|
"""function transformation
|
||||||
|
|
@ -34,9 +35,10 @@ class WarpingFunction(object):
|
||||||
def _get_param_names(self):
|
def _get_param_names(self):
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
def plot(self, psi, xmin, xmax):
|
def plot(self, xmin, xmax):
|
||||||
|
psi = self.psi
|
||||||
y = np.arange(xmin, xmax, 0.01)
|
y = np.arange(xmin, xmax, 0.01)
|
||||||
f_y = self.f(y, psi)
|
f_y = self.f(y)
|
||||||
from matplotlib import pyplot as plt
|
from matplotlib import pyplot as plt
|
||||||
plt.figure()
|
plt.figure()
|
||||||
plt.plot(y, f_y)
|
plt.plot(y, f_y)
|
||||||
|
|
@ -50,6 +52,7 @@ class TanhWarpingFunction(WarpingFunction):
|
||||||
"""n_terms specifies the number of tanh terms to be used"""
|
"""n_terms specifies the number of tanh terms to be used"""
|
||||||
self.n_terms = n_terms
|
self.n_terms = n_terms
|
||||||
self.num_parameters = 3 * self.n_terms
|
self.num_parameters = 3 * self.n_terms
|
||||||
|
super(TanhWarpingFunction, self).__init__(name='warp_tanh')
|
||||||
|
|
||||||
def f(self,y,psi):
|
def f(self,y,psi):
|
||||||
"""
|
"""
|
||||||
|
|
@ -163,8 +166,18 @@ class TanhWarpingFunction_d(WarpingFunction):
|
||||||
"""n_terms specifies the number of tanh terms to be used"""
|
"""n_terms specifies the number of tanh terms to be used"""
|
||||||
self.n_terms = n_terms
|
self.n_terms = n_terms
|
||||||
self.num_parameters = 3 * self.n_terms + 1
|
self.num_parameters = 3 * self.n_terms + 1
|
||||||
|
self.psi = np.ones((self.n_terms, 3))
|
||||||
|
|
||||||
def f(self,y,psi):
|
super(TanhWarpingFunction_d, self).__init__(name='warp_tanh')
|
||||||
|
self.psi = Param('psi', self.psi)
|
||||||
|
self.psi[:, :2].constrain_positive()
|
||||||
|
|
||||||
|
self.d = Param('%s' % ('d'), 1.0, Logexp())
|
||||||
|
self.link_parameter(self.psi)
|
||||||
|
self.link_parameter(self.d)
|
||||||
|
|
||||||
|
|
||||||
|
def f(self,y):
|
||||||
"""
|
"""
|
||||||
Transform y with f using parameter vector psi
|
Transform y with f using parameter vector psi
|
||||||
psi = [[a,b,c]]
|
psi = [[a,b,c]]
|
||||||
|
|
@ -175,9 +188,9 @@ class TanhWarpingFunction_d(WarpingFunction):
|
||||||
#1. check that number of params is consistent
|
#1. check that number of params is consistent
|
||||||
# assert psi.shape[0] == self.n_terms, 'inconsistent parameter dimensions'
|
# assert psi.shape[0] == self.n_terms, 'inconsistent parameter dimensions'
|
||||||
# assert psi.shape[1] == 4, 'inconsistent parameter dimensions'
|
# assert psi.shape[1] == 4, 'inconsistent parameter dimensions'
|
||||||
mpsi = psi.copy()
|
|
||||||
d = psi[-1]
|
d = self.d
|
||||||
mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3)
|
mpsi = self.psi
|
||||||
|
|
||||||
#3. transform data
|
#3. transform data
|
||||||
z = d*y.copy()
|
z = d*y.copy()
|
||||||
|
|
@ -187,7 +200,7 @@ class TanhWarpingFunction_d(WarpingFunction):
|
||||||
return z
|
return z
|
||||||
|
|
||||||
|
|
||||||
def f_inv(self, z, psi, max_iterations=1000, y=None):
|
def f_inv(self, z, max_iterations=1000, y=None):
|
||||||
"""
|
"""
|
||||||
calculate the numerical inverse of f
|
calculate the numerical inverse of f
|
||||||
|
|
||||||
|
|
@ -198,12 +211,12 @@ class TanhWarpingFunction_d(WarpingFunction):
|
||||||
z = z.copy()
|
z = z.copy()
|
||||||
if y is None:
|
if y is None:
|
||||||
y = np.ones_like(z)
|
y = np.ones_like(z)
|
||||||
|
|
||||||
it = 0
|
it = 0
|
||||||
update = np.inf
|
update = np.inf
|
||||||
|
|
||||||
while it == 0 or (np.abs(update).sum() > 1e-10 and it < max_iterations):
|
while it == 0 or (np.abs(update).sum() > 1e-10 and it < max_iterations):
|
||||||
update = (self.f(y, psi) - z)/self.fgrad_y(y, psi)
|
update = (self.f(y) - z)/self.fgrad_y(y)
|
||||||
y -= update
|
y -= update
|
||||||
it += 1
|
it += 1
|
||||||
if it == max_iterations:
|
if it == max_iterations:
|
||||||
|
|
@ -212,7 +225,7 @@ class TanhWarpingFunction_d(WarpingFunction):
|
||||||
return y
|
return y
|
||||||
|
|
||||||
|
|
||||||
def fgrad_y(self, y, psi, return_precalc = False):
|
def fgrad_y(self, y,return_precalc = False):
|
||||||
"""
|
"""
|
||||||
gradient of f w.r.t to y ([N x 1])
|
gradient of f w.r.t to y ([N x 1])
|
||||||
|
|
||||||
|
|
@ -221,9 +234,8 @@ class TanhWarpingFunction_d(WarpingFunction):
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
mpsi = psi.copy()
|
d = self.d
|
||||||
d = psi[-1]
|
mpsi = self.psi
|
||||||
mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3)
|
|
||||||
|
|
||||||
# vectorized version
|
# vectorized version
|
||||||
|
|
||||||
|
|
@ -240,7 +252,7 @@ class TanhWarpingFunction_d(WarpingFunction):
|
||||||
return GRAD
|
return GRAD
|
||||||
|
|
||||||
|
|
||||||
def fgrad_y_psi(self, y, psi, return_covar_chain = False):
|
def fgrad_y_psi(self, y, return_covar_chain = False):
|
||||||
"""
|
"""
|
||||||
gradient of f w.r.t to y and psi
|
gradient of f w.r.t to y and psi
|
||||||
|
|
||||||
|
|
@ -248,10 +260,10 @@ class TanhWarpingFunction_d(WarpingFunction):
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
mpsi = psi.copy()
|
|
||||||
mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3)
|
|
||||||
|
|
||||||
w, s, r, d = self.fgrad_y(y, psi, return_precalc = True)
|
mpsi = self.psi
|
||||||
|
|
||||||
|
w, s, r, d = self.fgrad_y(y, return_precalc = True)
|
||||||
|
|
||||||
gradients = np.zeros((y.shape[0], y.shape[1], len(mpsi), 4))
|
gradients = np.zeros((y.shape[0], y.shape[1], len(mpsi), 4))
|
||||||
for i in range(len(mpsi)):
|
for i in range(len(mpsi)):
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue