Merge branch 'mean_functions' into devel

James Hensman 2015-04-01 09:16:08 +01:00
commit 8c71d52b7f
27 changed files with 461 additions and 137 deletions

View file

@@ -5,6 +5,7 @@ import numpy as np
 import sys
 from .. import kern
 from model import Model
+from mapping import Mapping
 from parameterization import ObsAr
 from .. import likelihoods
 from ..inference.latent_function_inference import exact_gaussian_inference, expectation_propagation
@@ -34,7 +35,7 @@ class GP(Model):
     """
-    def __init__(self, X, Y, kernel, likelihood, inference_method=None, name='gp', Y_metadata=None, normalizer=False):
+    def __init__(self, X, Y, kernel, likelihood, mean_function=None, inference_method=None, name='gp', Y_metadata=None, normalizer=False):
         super(GP, self).__init__(name)
         assert X.ndim == 2
@@ -75,6 +76,15 @@ class GP(Model):
         assert isinstance(likelihood, likelihoods.Likelihood)
         self.likelihood = likelihood

+        #handle the mean function
+        self.mean_function = mean_function
+        if mean_function is not None:
+            assert isinstance(self.mean_function, Mapping)
+            assert mean_function.input_dim == self.input_dim
+            assert mean_function.output_dim == self.output_dim
+            self.link_parameter(mean_function)
+
         #find a sensible inference method
         logger.info("initializing inference method")
         if inference_method is None:
@@ -153,9 +163,11 @@ class GP(Model):
         This method is not designed to be called manually, the framework is set up to automatically call this method upon changes to parameters, if you call
         this method yourself, there may be unexpected consequences.
         """
-        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.Y_metadata)
+        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.kern, self.X, self.likelihood, self.Y_normalized, self.mean_function, self.Y_metadata)
         self.likelihood.update_gradients(self.grad_dict['dL_dthetaL'])
         self.kern.update_gradients_full(self.grad_dict['dL_dK'], self.X)
+        if self.mean_function is not None:
+            self.mean_function.update_gradients(self.grad_dict['dL_dm'], self.X)

     def log_likelihood(self):
         """
@@ -192,6 +204,10 @@ class GP(Model):
         #force mu to be a column vector
         if len(mu.shape)==1: mu = mu[:,None]
+        #add the mean function in
+        if self.mean_function is not None:
+            mu += self.mean_function.f(_Xnew)
         return mu, var

     def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None):
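The plumbing above is everything needed to attach a parameterised mean function to a plain GP: the mapping is linked as a parameter, its gradient is fed from grad_dict['dL_dm'], and its value is added back at predict time. A minimal usage sketch, along the lines of the examples added later in this commit (the toy data here is made up):

import numpy as np
import GPy

X = np.linspace(0, 10, 50).reshape(-1, 1)
Y = np.sin(X) + 3*X + 0.1*np.random.randn(*X.shape)

# the new keyword: any GPy mapping with matching input/output dimensions
m = GPy.core.GP(X, Y, kernel=GPy.kern.RBF(1),
                likelihood=GPy.likelihoods.Gaussian(),
                mean_function=GPy.mappings.Linear(1, 1))
m.optimize()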

View file

@@ -19,7 +19,7 @@ class SparseGP(GP):
     This model allows (approximate) inference using variational DTC or FITC
     (Gaussian likelihoods) as well as non-conjugate sparse methods based on
     these.

     This is not for missing data, as the implementation for missing data involves
     some inefficient optimization routine decisions.
     See missing data SparseGP implementation in py:class:'~GPy.models.sparse_gp_minibatch.SparseGPMiniBatch'.
@@ -39,7 +39,7 @@ class SparseGP(GP):
     """
-    def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
+    def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, inference_method=None,
                  name='sparse gp', Y_metadata=None, normalizer=False):
         #pick a sensible inference method
         if inference_method is None:
@@ -53,7 +53,7 @@ class SparseGP(GP):
         self.Z = Param('inducing inputs', Z)
         self.num_inducing = Z.shape[0]

-        GP.__init__(self, X, Y, kernel, likelihood, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)
+        GP.__init__(self, X, Y, kernel, likelihood, mean_function, inference_method=inference_method, name=name, Y_metadata=Y_metadata, normalizer=normalizer)

         logger.info("Adding Z as parameter")
         self.link_parameter(self.Z, index=0)
@@ -61,7 +61,7 @@ class SparseGP(GP):
     def has_uncertain_inputs(self):
         return isinstance(self.X, VariationalPosterior)

     def set_Z(self, Z, trigger_update=True):
         if trigger_update: self.update_model(False)
         self.unlink_parameter(self.Z)
@@ -110,8 +110,8 @@ class SparseGP(GP):
     def _raw_predict(self, Xnew, full_cov=False, kern=None):
         """
         Make a prediction for the latent function values.

         For certain inputs we give back a full_cov of shape NxN;
         if there is missing data, each dimension has its own full_cov of shape NxNxD, and if full_cov is off,
         we take only the diagonal elements across N.
@@ -136,6 +136,9 @@ class SparseGP(GP):
             else:
                 Kxx = kern.Kdiag(Xnew)
                 var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T
+            #add in the mean function
+            if self.mean_function is not None:
+                mu += self.mean_function.f(Xnew)
         else:
             psi0_star = self.kern.psi0(self.Z, Xnew)
             psi1_star = self.kern.psi1(self.Z, Xnew)
@@ -165,4 +168,5 @@ class SparseGP(GP):
                     var[i] = var_
                 else:
                     var[i] = np.diag(var_)+p0-t2

         return mu, var

View file

@@ -9,7 +9,7 @@ from ..inference.latent_function_inference import SVGP as svgp_inf

 class SVGP(SparseGP):
-    def __init__(self, X, Y, Z, kernel, likelihood, name='SVGP', Y_metadata=None, batchsize=None):
+    def __init__(self, X, Y, Z, kernel, likelihood, mean_function=None, name='SVGP', Y_metadata=None, batchsize=None):
         """
         Stochastic Variational GP.
@@ -38,7 +38,7 @@ class SVGP(SparseGP):
         #create the SVI inference method
         inf_method = svgp_inf()

-        SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, inference_method=inf_method,
+        SparseGP.__init__(self, X_batch, Y_batch, Z, kernel, likelihood, mean_function=mean_function, inference_method=inf_method,
                           name=name, Y_metadata=Y_metadata, normalizer=False)

         self.m = Param('q_u_mean', np.zeros((self.num_inducing, Y.shape[1])))
@@ -48,7 +48,7 @@ class SVGP(SparseGP):
         self.link_parameter(self.m)

     def parameters_changed(self):
-        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0]))
+        self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.mean_function, self.Y_metadata, KL_scale=1.0, batch_scale=float(self.X_all.shape[0])/float(self.X.shape[0]))

         #update the kernel gradients
         self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z)
@@ -65,6 +65,13 @@ class SVGP(SparseGP):
         self.m.gradient = self.grad_dict['dL_dm']
         self.chol.gradient = self.grad_dict['dL_dchol']

+        if self.mean_function is not None:
+            self.mean_function.update_gradients(self.grad_dict['dL_dmfX'], self.X)
+            g = self.mean_function.gradient[:].copy()
+            self.mean_function.update_gradients(self.grad_dict['dL_dmfZ'], self.Z)
+            self.mean_function.gradient[:] += g
+            self.Z.gradient[:] += self.mean_function.gradients_X(self.grad_dict['dL_dmfZ'], self.Z)
+
     def set_data(self, X, Y):
         """
         Set the data without calling parameters_changed to avoid wasted computation
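A note on the gradient accumulation in parameters_changed above: the mean function enters the SVGP bound twice, through m(X) in the likelihood term and through m(Z) in the prior over the inducing outputs, so its parameter gradient is the sum of two chain-rule terms,

    dL/dtheta = (dm(X)/dtheta)^T * dL_dmfX + (dm(Z)/dtheta)^T * dL_dmfZ,

which is why the code copies the gradient after the first update_gradients call and adds it back after the second. And because Z is itself a free parameter, Z.gradient additionally picks up dL_dmfZ pushed back through the mapping's gradients_X.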

View file

@@ -505,3 +505,48 @@ def uncertain_inputs_sparse_regression(max_iters=200, optimize=True, plot=True):
     print m
     return m

+def simple_mean_function(max_iters=100, optimize=True, plot=True):
+    """
+    The simplest possible mean function. No parameters, just a simple sinusoid.
+    """
+    #create simple mean function
+    mf = GPy.core.Mapping(1,1)
+    mf.f = np.sin
+    mf.update_gradients = lambda a,b: None
+
+    X = np.linspace(0,10,50).reshape(-1,1)
+    Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape)
+
+    k = GPy.kern.RBF(1)
+    lik = GPy.likelihoods.Gaussian()
+    m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
+    if optimize:
+        m.optimize(max_iters=max_iters)
+    if plot:
+        m.plot(plot_limits=(-10,15))
+    return m
+
+def parametric_mean_function(max_iters=100, optimize=True, plot=True):
+    """
+    A linear mean function with parameters that we'll learn alongside the kernel.
+    """
+    #create a linear mean function
+    mf = GPy.mappings.Linear(1,1)
+
+    X = np.linspace(0,10,50).reshape(-1,1)
+    Y = np.sin(X) + 0.5*np.cos(3*X) + 0.1*np.random.randn(*X.shape) + 3*X
+
+    k = GPy.kern.RBF(1)
+    lik = GPy.likelihoods.Gaussian()
+    m = GPy.core.GP(X, Y, kernel=k, likelihood=lik, mean_function=mf)
+    if optimize:
+        m.optimize(max_iters=max_iters)
+    if plot:
+        m.plot()
+    return m

View file

@@ -20,7 +20,8 @@ class DTC(LatentFunctionInference):
     def __init__(self):
         self.const_jitter = 1e-6

-    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
+    def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
+        assert mean_function is None, "inference with a mean function not implemented"
         assert X_variance is None, "cannot use X_variance with DTC. Try varDTC."

         num_inducing, _ = Z.shape
@@ -88,7 +89,8 @@ class vDTC(object):
     def __init__(self):
         self.const_jitter = 1e-6

-    def inference(self, kern, X, X_variance, Z, likelihood, Y, Y_metadata):
+    def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
+        assert mean_function is None, "inference with a mean function not implemented"
         assert X_variance is None, "cannot use X_variance with DTC. Try varDTC."

         num_inducing, _ = Z.shape

View file

@@ -36,11 +36,18 @@ class ExactGaussianInference(LatentFunctionInference):
             #print "WARNING: N>D of Y, we need caching of L, such that L*L^T = Y, returning Y still!"
             return Y

-    def inference(self, kern, X, likelihood, Y, Y_metadata=None):
+    def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None):
         """
         Returns a Posterior class containing essential quantities of the posterior
         """
-        YYT_factor = self.get_YYTfactor(Y)
+        if mean_function is None:
+            m = 0
+        else:
+            m = mean_function.f(X)
+
+        YYT_factor = self.get_YYTfactor(Y-m)

         K = kern.K(X)
@@ -56,4 +63,4 @@ class ExactGaussianInference(LatentFunctionInference):
         dL_dthetaL = likelihood.exact_inference_gradients(np.diag(dL_dK),Y_metadata)

-        return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL}
+        return Posterior(woodbury_chol=LW, woodbury_vector=alpha, K=K), log_marginal, {'dL_dK':dL_dK, 'dL_dthetaL':dL_dthetaL, 'dL_dm':alpha}
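For reference, a minimal numpy sketch of the centred computation above, assuming a single-output Y, a fixed Gaussian noise variance sigma2 and a callable mean_f (hypothetical names, not GPy API): subtracting m = mean_f(X) leaves the rest of the algebra untouched, and the gradient of the log marginal with respect to the mean values falls out as alpha, which is why 'dL_dm' can simply reuse it.

import numpy as np

def exact_inference_sketch(K, Y, sigma2, mean_f=None, X=None):
    m = mean_f(X) if mean_f is not None else 0.
    Ky = K + np.eye(K.shape[0]) * sigma2            # noise-corrupted covariance
    L = np.linalg.cholesky(Ky)
    alpha = np.linalg.solve(L.T, np.linalg.solve(L, Y - m))   # Ky^{-1} (Y - m)
    # log p(Y) = -0.5 (Y-m)^T Ky^{-1} (Y-m) - sum(log diag(L)) - 0.5 N log(2 pi)
    log_marginal = (-0.5 * np.sum((Y - m) * alpha)
                    - np.sum(np.log(np.diag(L)))
                    - 0.5 * K.shape[0] * np.log(2 * np.pi))
    dL_dm = alpha   # d log p(Y) / d m, returned as grad_dict['dL_dm']
    return alpha, log_marginal, dL_dm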

View file

@@ -33,7 +33,8 @@ class EP(LatentFunctionInference):
         # TODO: update approximation in the end as well? Maybe even with a switch?
         pass

-    def inference(self, kern, X, likelihood, Y, Y_metadata=None, Z=None):
+    def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None, Z=None):
+        assert mean_function is None, "inference with a mean function not implemented"
         num_data, output_dim = Y.shape
         assert output_dim ==1, "ep in 1D only (for now!)"

View file

@@ -64,7 +64,8 @@ class EPDTC(LatentFunctionInference):
         self.old_mutilde, self.old_vtilde = None, None
         self._ep_approximation = None

-    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
+    def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
+        assert mean_function is None, "inference with a mean function not implemented"
         num_data, output_dim = Y.shape
         assert output_dim ==1, "ep in 1D only (for now!)"

View file

@@ -18,7 +18,8 @@ class FITC(LatentFunctionInference):
     """
     const_jitter = 1e-6

-    def inference(self, kern, X, Z, likelihood, Y, Y_metadata=None):
+    def inference(self, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None):
+        assert mean_function is None, "inference with a mean function not implemented"
         num_inducing, _ = Z.shape
         num_data, output_dim = Y.shape

View file

@@ -39,10 +39,12 @@ class Laplace(LatentFunctionInference):
         self.first_run = True
         self._previous_Ki_fhat = None

-    def inference(self, kern, X, likelihood, Y, Y_metadata=None):
+    def inference(self, kern, X, likelihood, Y, mean_function=None, Y_metadata=None):
         """
         Returns a Posterior class containing essential quantities of the posterior
         """
+        assert mean_function is None, "inference with a mean function not implemented"
+
         # Compute K
         K = kern.K(X)

View file

@@ -15,7 +15,7 @@ class Posterior(object):
     the function at any new point x_* by integrating over this posterior.
     """
-    def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None):
+    def __init__(self, woodbury_chol=None, woodbury_vector=None, K=None, mean=None, cov=None, K_chol=None, woodbury_inv=None, prior_mean=0):
         """
         woodbury_chol : a lower triangular matrix L that satisfies posterior_covariance = K - K L^{-T} L^{-1} K
         woodbury_vector : a matrix (or vector, as Nx1 matrix) M which satisfies posterior_mean = K M
@@ -67,6 +67,7 @@ class Posterior(object):
         #option 2:
         self._mean = mean
         self._covariance = cov
+        self._prior_mean = prior_mean

         #compute this lazily
         self._precision = None
@@ -175,7 +176,7 @@ class Posterior(object):
         $$
         """
         if self._woodbury_vector is None:
-            self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean)
+            self._woodbury_vector, _ = dpotrs(self.K_chol, self.mean - self._prior_mean)
         return self._woodbury_vector

     @property
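With a nonzero prior mean the woodbury vector is computed from the centred posterior mean, M = K^{-1}(mu - m(X)), which is exactly the `self.mean - self._prior_mean` in the dpotrs call above. Downstream predictions then recover

    mu_* = m(X_*) + K_{*X} K^{-1} (mu - m(X)),

consistent with GP._raw_predict adding mean_function.f(Xnew) back on top of the zero-mean prediction.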

View file

@@ -6,7 +6,8 @@ from posterior import Posterior

 class SVGP(LatentFunctionInference):
-    def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, Y_metadata=None, KL_scale=1.0, batch_scale=1.0):
+    def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, mean_function=None, Y_metadata=None, KL_scale=1.0, batch_scale=1.0):
         num_inducing = Z.shape[0]
         num_data, num_outputs = Y.shape
@@ -22,6 +23,15 @@ class SVGP(LatentFunctionInference):
         #S = S + np.eye(S.shape[0])*1e-5*np.max(np.max(S))
         #Si, Lnew, _,_ = linalg.pdinv(S)

+        #compute mean function stuff
+        if mean_function is not None:
+            prior_mean_u = mean_function.f(Z)
+            prior_mean_f = mean_function.f(X)
+        else:
+            prior_mean_u = np.zeros((num_inducing, num_outputs))
+            prior_mean_f = np.zeros((num_data, num_outputs))
+
         #compute kernel related stuff
         Kmm = kern.K(Z)
         Knm = kern.K(X, Z)
@@ -30,17 +40,31 @@ class SVGP(LatentFunctionInference):
         #compute the marginal means and variances of q(f)
         A = np.dot(Knm, Kmmi)
-        mu = np.dot(A, q_u_mean)
+        mu = prior_mean_f + np.dot(A, q_u_mean - prior_mean_u)
         v = Knn_diag[:,None] - np.sum(A*Knm,1)[:,None] + np.sum(A[:,:,None] * np.einsum('ij,jkl->ikl', A, S),1)

         #compute the KL term
         Kmmim = np.dot(Kmmi, q_u_mean)
         KLs = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.einsum('ij,ijk->k', Kmmi, S) + 0.5*np.sum(q_u_mean*Kmmim,0)
         KL = KLs.sum()
-        dKL_dm = Kmmim
+
+        #gradient of the KL term (assuming zero mean function)
+        dKL_dm = Kmmim.copy()
         dKL_dS = 0.5*(Kmmi[:,:,None] - Si)
         dKL_dKmm = 0.5*num_outputs*Kmmi - 0.5*Kmmi.dot(S.sum(-1)).dot(Kmmi) - 0.5*Kmmim.dot(Kmmim.T)

+        if mean_function is not None:
+            #adjust KL term for mean function
+            Kmmi_mfZ = np.dot(Kmmi, prior_mean_u)
+            KL += -np.sum(q_u_mean*Kmmi_mfZ)
+            KL += 0.5*np.sum(Kmmi_mfZ*prior_mean_u)
+
+            #adjust gradient for mean function
+            dKL_dm -= Kmmi_mfZ
+            dKL_dKmm += Kmmim.dot(Kmmi_mfZ.T)
+            dKL_dKmm -= 0.5*Kmmi_mfZ.dot(Kmmi_mfZ.T)
+
+            #compute gradients for mean_function
+            dKL_dmfZ = Kmmi_mfZ - Kmmim
+
         #quadrature for the likelihood
         F, dF_dmu, dF_dv, dF_dthetaL = likelihood.variational_expectations(Y, mu, v, Y_metadata=Y_metadata)
@@ -50,11 +74,9 @@ class SVGP(LatentFunctionInference):
         if dF_dthetaL is not None:
             dF_dthetaL = dF_dthetaL.sum(1).sum(1)*batch_scale

-        #derivatives of expected likelihood
+        #derivatives of expected likelihood, assuming zero mean function
         Adv = A.T[:,:,None]*dF_dv[None,:,:] # As if dF_Dv is diagonal
         Admu = A.T.dot(dF_dmu)
-        #AdvA = np.einsum('ijk,jl->ilk', Adv, A)
-        #AdvA = np.dot(A.T, Adv).swapaxes(0,1)
         AdvA = np.dstack([np.dot(A.T, Adv[:,:,i].T) for i in range(num_outputs)])
         tmp = np.einsum('ijk,jlk->il', AdvA, S).dot(Kmmi)
         dF_dKmm = -Admu.dot(Kmmim.T) + AdvA.sum(-1) - tmp - tmp.T
@@ -64,6 +86,14 @@ class SVGP(LatentFunctionInference):
         dF_dm = Admu
         dF_dS = AdvA

+        #adjust gradient to account for mean function
+        if mean_function is not None:
+            dF_dmfX = dF_dmu.copy()
+            dF_dmfZ = -Admu
+            dF_dKmn -= np.dot(Kmmi_mfZ, dF_dmu.T)
+            dF_dKmm += Admu.dot(Kmmi_mfZ.T)
+
         #sum (gradients of) expected likelihood and KL part
         log_marginal = F.sum() - KL
         dL_dm, dL_dS, dL_dKmm, dL_dKmn = dF_dm - dKL_dm, dF_dS- dKL_dS, dF_dKmm- dKL_dKmm, dF_dKmn
@@ -71,4 +101,8 @@ class SVGP(LatentFunctionInference):
         dL_dchol = np.dstack([2.*np.dot(dL_dS[:,:,i], L[:,:,i]) for i in range(num_outputs)])
         dL_dchol = choleskies.triang_to_flat(dL_dchol)

-        return Posterior(mean=q_u_mean, cov=S, K=Kmm), log_marginal, {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv.sum(1), 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}
+        grad_dict = {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv.sum(1), 'dL_dm':dL_dm, 'dL_dchol':dL_dchol, 'dL_dthetaL':dF_dthetaL}
+        if mean_function is not None:
+            grad_dict['dL_dmfZ'] = dF_dmfZ - dKL_dmfZ
+            grad_dict['dL_dmfX'] = dF_dmfX
+        return Posterior(mean=q_u_mean, cov=S, K=Kmm, prior_mean=prior_mean_u), log_marginal, grad_dict

View file

@@ -6,6 +6,20 @@ from kern import CombinationKernel
 from ...util.caching import Cache_this
 import itertools

+def numpy_invalid_op_as_exception(func):
+    """
+    A decorator that allows catching numpy invalid operations
+    as exceptions (the default behaviour is raising warnings).
+    """
+    def func_wrapper(*args, **kwargs):
+        np.seterr(invalid='raise')
+        result = func(*args, **kwargs)
+        np.seterr(invalid='warn')
+        return result
+    return func_wrapper
+
 class Prod(CombinationKernel):
     """
     Computes the product of 2 kernels
@@ -46,18 +60,20 @@ class Prod(CombinationKernel):
             self.parts[0].update_gradients_full(dL_dK*self.parts[1].K(X,X2), X, X2)
             self.parts[1].update_gradients_full(dL_dK*self.parts[0].K(X,X2), X, X2)
         else:
-            k = self.K(X,X2)*dL_dK
-            for p in self.parts:
-                p.update_gradients_full(k/p.K(X,X2),X,X2)
+            for combination in itertools.combinations(self.parts, len(self.parts) - 1):
+                prod = reduce(np.multiply, [p.K(X, X2) for p in combination])
+                to_update = list(set(self.parts) - set(combination))[0]
+                to_update.update_gradients_full(dL_dK * prod, X, X2)

     def update_gradients_diag(self, dL_dKdiag, X):
         if len(self.parts)==2:
             self.parts[0].update_gradients_diag(dL_dKdiag*self.parts[1].Kdiag(X), X)
             self.parts[1].update_gradients_diag(dL_dKdiag*self.parts[0].Kdiag(X), X)
         else:
-            k = self.Kdiag(X)*dL_dKdiag
-            for p in self.parts:
-                p.update_gradients_diag(k/p.Kdiag(X),X)
+            for combination in itertools.combinations(self.parts, len(self.parts) - 1):
+                prod = reduce(np.multiply, [p.Kdiag(X) for p in combination])
+                to_update = list(set(self.parts) - set(combination))[0]
+                to_update.update_gradients_diag(dL_dKdiag * prod, X)

     def gradients_X(self, dL_dK, X, X2=None):
         target = np.zeros(X.shape)
@@ -65,9 +81,10 @@ class Prod(CombinationKernel):
             target += self.parts[0].gradients_X(dL_dK*self.parts[1].K(X, X2), X, X2)
             target += self.parts[1].gradients_X(dL_dK*self.parts[0].K(X, X2), X, X2)
         else:
-            k = self.K(X,X2)*dL_dK
-            for p in self.parts:
-                target += p.gradients_X(k/p.K(X,X2),X,X2)
+            for combination in itertools.combinations(self.parts, len(self.parts) - 1):
+                prod = reduce(np.multiply, [p.K(X, X2) for p in combination])
+                to_update = list(set(self.parts) - set(combination))[0]
+                target += to_update.gradients_X(dL_dK * prod, X, X2)
         return target

     def gradients_X_diag(self, dL_dKdiag, X):
@@ -80,3 +97,5 @@ class Prod(CombinationKernel):
         for p in self.parts:
             target += p.gradients_X_diag(k/p.Kdiag(X),X)
         return target
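The leave-one-out products above exist to avoid dividing by kernel values. A small self-contained sketch of the failure mode, with K1, K2, K3 as hypothetical stand-ins for self.parts[i].K(X, X2): dividing the full product by one factor turns any zero entry into 0/0 = NaN, while multiplying the remaining factors directly stays finite (this is what the new KernelTestsProductWithZeroValues test later in this commit exercises).

import numpy as np
from functools import reduce

K1 = np.array([[0., 1.], [1., 0.]])   # a factor with zeros, e.g. a linear kernel
K2 = np.ones((2, 2))
K3 = 2 * np.ones((2, 2))
parts = [K1, K2, K3]

full = reduce(np.multiply, parts)
old_factor = full / K1                        # 0/0 -> NaN in the gradient
new_factor = reduce(np.multiply, [K2, K3])    # leave-one-out product: finite

print(np.isnan(old_factor).any())   # True
print(np.isnan(new_factor).any())   # False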

View file

@@ -4,4 +4,5 @@
 from kernel import Kernel
 from linear import Linear
 from mlp import MLP
-#from rbf import RBF
+from additive import Additive
+from compound import Compound

View file

@@ -2,8 +2,7 @@
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
-from ..core.mapping import Mapping
-import GPy
+from ..core import Mapping

 class Additive(Mapping):
     """
@@ -27,8 +26,6 @@ class Additive(Mapping):
         Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
         self.mapping1 = mapping1
         self.mapping2 = mapping2
-        self.num_params = self.mapping1.num_params + self.mapping2.num_params
-        self.name = self.mapping1.name + '+' + self.mapping2.name

     def f(self, X):
         return self.mapping1.f(X) + self.mapping2.f(X)

GPy/mappings/compound.py (new file, 39 lines)

View file

@@ -0,0 +1,39 @@
+# Copyright (c) 2015, James Hensman and Alan Saul
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+from ..core import Mapping
+
+class Compound(Mapping):
+    """
+    Mapping based on passing one mapping through another
+
+    .. math::
+
+       f(\mathbf{x}) = f_2(f_1(\mathbf{x}))
+
+    :param mapping1: first mapping
+    :type mapping1: GPy.mappings.Mapping
+    :param mapping2: second mapping
+    :type mapping2: GPy.mappings.Mapping
+    """
+    def __init__(self, mapping1, mapping2):
+        assert(mapping1.output_dim==mapping2.input_dim)
+        input_dim, output_dim = mapping1.input_dim, mapping2.output_dim
+        Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim)
+        self.mapping1 = mapping1
+        self.mapping2 = mapping2
+        self.link_parameters(self.mapping1, self.mapping2)
+
+    def f(self, X):
+        return self.mapping2.f(self.mapping1.f(X))
+
+    def update_gradients(self, dL_dF, X):
+        hidden = self.mapping1.f(X)
+        self.mapping2.update_gradients(dL_dF, hidden)
+        self.mapping1.update_gradients(self.mapping2.gradients_X(dL_dF, hidden), X)
+
+    def gradients_X(self, dL_dF, X):
+        hidden = self.mapping1.f(X)
+        return self.mapping1.gradients_X(self.mapping2.gradients_X(dL_dF, hidden), X)
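update_gradients here is just the chain rule: the gradient for mapping1's parameters is dL_dF pushed back through mapping2's input gradient, evaluated at the hidden layer. A usage sketch mirroring the test added later in this commit (the dimensions are arbitrary):

import numpy as np
import GPy

m1 = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
m2 = GPy.mappings.Linear(input_dim=2, output_dim=1)
mapping = GPy.mappings.Compound(m1, m2)          # f(x) = m2.f(m1.f(x))
print(mapping.f(np.random.randn(10, 3)).shape)   # (10, 1)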

View file

@@ -36,16 +36,16 @@ class Kernel(Mapping):
         Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
         self.kern = kernel
         self.Z = Z
-        self.num_bases, Zdim = X.shape
+        self.num_bases, Zdim = Z.shape
         assert Zdim == self.input_dim
-        self.A = GPy.core.Param('A', np.random.randn(self.num_bases, self.output_dim))
-        self.add_parameter(self.A)
+        self.A = Param('A', np.random.randn(self.num_bases, self.output_dim))
+        self.link_parameter(self.A)

     def f(self, X):
         return np.dot(self.kern.K(X, self.Z), self.A)

     def update_gradients(self, dL_dF, X):
-        self.kern.update_gradients_full(np.dot(dL_dF, self.A.T))
+        self.kern.update_gradients_full(np.dot(dL_dF, self.A.T), X, self.Z)
         self.A.gradient = np.dot( self.kern.K(self.Z, X), dL_dF)

     def gradients_X(self, dL_dF, X):

View file

@@ -26,8 +26,8 @@ class Linear(Mapping):
     def __init__(self, input_dim, output_dim, name='linmap'):
         Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
-        self.A = GPy.core.Param('A', np.random.randn(self.input_dim, self.output_dim))
-        self.add_parameter(self.A)
+        self.A = Param('A', np.random.randn(self.input_dim, self.output_dim))
+        self.link_parameter(self.A)

     def f(self, X):
         return np.dot(X, self.A)

View file

@@ -11,32 +11,45 @@ class MLP(Mapping):
     """
     def __init__(self, input_dim=1, output_dim=1, hidden_dim=3, name='mlpmap'):
-        super(MLP).__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
+        super(MLP, self).__init__(input_dim=input_dim, output_dim=output_dim, name=name)
         self.hidden_dim = hidden_dim
         self.W1 = Param('W1', np.random.randn(self.input_dim, self.hidden_dim))
         self.b1 = Param('b1', np.random.randn(self.hidden_dim))
         self.W2 = Param('W2', np.random.randn(self.hidden_dim, self.output_dim))
         self.b2 = Param('b2', np.random.randn(self.output_dim))
+        self.link_parameters(self.W1, self.b1, self.W2, self.b2)

     def f(self, X):
-        N, D = X.shape
-        activations = np.tanh(np.dot(X,self.W1) + self.b1)
-        self.out = np.dot(self.activations,self.W2) + self.b2
-        return self.output_fn(self.out)
+        layer1 = np.dot(X, self.W1) + self.b1
+        activations = np.tanh(layer1)
+        return np.dot(activations, self.W2) + self.b2

     def update_gradients(self, dL_dF, X):
-        activations = np.tanh(np.dot(X,self.W1) + self.b1)
+        layer1 = np.dot(X,self.W1) + self.b1
+        activations = np.tanh(layer1)
         #Evaluate second-layer gradients.
         self.W2.gradient = np.dot(activations.T, dL_dF)
         self.b2.gradient = np.sum(dL_dF, 0)
         # Backpropagation to hidden layer.
-        delta_hid = np.dot(dL_dF, self.W2.T) * (1.0 - activations**2)
+        dL_dact = np.dot(dL_dF, self.W2.T)
+        dL_dlayer1 = dL_dact / np.square(np.cosh(layer1))
         # Finally, evaluate the first-layer gradients.
-        self.W1.gradients = np.dot(X.T,delta_hid)
-        self.b1.gradients = np.sum(delta_hid, 0)
+        self.W1.gradient = np.dot(X.T,dL_dlayer1)
+        self.b1.gradient = np.sum(dL_dlayer1, 0)
+
+    def gradients_X(self, dL_dF, X):
+        layer1 = np.dot(X,self.W1) + self.b1
+        activations = np.tanh(layer1)
+        # Backpropagation to hidden layer.
+        dL_dact = np.dot(dL_dF, self.W2.T)
+        dL_dlayer1 = dL_dact / np.square(np.cosh(layer1))
+        return np.dot(dL_dlayer1, self.W1.T)
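Both forms of the hidden-layer backprop factor are equivalent, since d/dz tanh(z) = 1 - tanh^2(z) = 1/cosh^2(z); the rewrite just computes the derivative from the pre-activation layer1 rather than from activations, which lets f, update_gradients and the new gradients_X share the same intermediate quantities.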

View file

@@ -43,10 +43,11 @@ class SparseGPMiniBatch(SparseGP):
     def __init__(self, X, Y, Z, kernel, likelihood, inference_method=None,
                  name='sparse gp', Y_metadata=None, normalizer=False,
                  missing_data=False, stochastic=False, batchsize=1):
-        #pick a sensible inference method
+        # pick a sensible inference method
         if inference_method is None:
             if isinstance(likelihood, likelihoods.Gaussian):
-                inference_method = var_dtc.VarDTC(limit=1 if not self.missing_data else Y.shape[1])
+                inference_method = var_dtc.VarDTC(limit=1 if not missing_data else Y.shape[1])
             else:
                 #inference_method = ??
                 raise NotImplementedError, "what to do what to do?"

View file

@@ -1,7 +1,6 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
 from ..util.warping_functions import *
 from ..core import GP
@@ -10,14 +9,16 @@ from GPy.util.warping_functions import TanhWarpingFunction_d
 from GPy import kern

 class WarpedGP(GP):
-    def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3, normalize_X=False, normalize_Y=False):
+    def __init__(self, X, Y, kernel=None, warping_function=None, warping_terms=3):
         if kernel is None:
-            kernel = kern.rbf(X.shape[1])
+            kernel = kern.RBF(X.shape[1])

         if warping_function == None:
             self.warping_function = TanhWarpingFunction_d(warping_terms)
             self.warping_params = (np.random.randn(self.warping_function.n_terms * 3 + 1,) * 1)
+        else:
+            self.warping_function = warping_function

         self.scale_data = False
         if self.scale_data:
@@ -25,10 +26,10 @@ class WarpedGP(GP):
         self.has_uncertain_inputs = False
         self.Y_untransformed = Y.copy()
         self.predict_in_warped_space = False
-        likelihood = likelihoods.Gaussian(self.transform_data(), normalize=normalize_Y)
-        GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X)
-        self._set_params(self._get_params())
+        likelihood = likelihoods.Gaussian()
+        GP.__init__(self, X, self.transform_data(), likelihood=likelihood, kernel=kernel)
+        self.link_parameter(self.warping_function)

     def _scale_data(self, Y):
         self._Ymax = Y.max()
@@ -38,62 +39,55 @@ class WarpedGP(GP):
     def _unscale_data(self, Y):
         return (Y + 0.5) * (self._Ymax - self._Ymin) + self._Ymin

-    def _set_params(self, x):
-        self.warping_params = x[:self.warping_function.num_parameters]
-        Y = self.transform_data()
-        self.likelihood.set_data(Y)
-        GP._set_params(self, x[self.warping_function.num_parameters:].copy())
-
-    def _get_params(self):
-        return np.hstack((self.warping_params.flatten().copy(), GP._get_params(self).copy()))
-
-    def _get_param_names(self):
-        warping_names = self.warping_function._get_param_names()
-        param_names = GP._get_param_names(self)
-        return warping_names + param_names
-
-    def transform_data(self):
-        Y = self.warping_function.f(self.Y_untransformed.copy(), self.warping_params).copy()
-        return Y
-
-    def log_likelihood(self):
-        ll = GP.log_likelihood(self)
-        jacobian = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params)
-        return ll + np.log(jacobian).sum()
-
-    def _log_likelihood_gradients(self):
-        ll_grads = GP._log_likelihood_gradients(self)
-        alpha = np.dot(self.Ki, self.likelihood.Y.flatten())
-        warping_grads = self.warping_function_gradients(alpha)
-        warping_grads = np.append(warping_grads[:, :-1].flatten(), warping_grads[0, -1])
-        return np.hstack((warping_grads.flatten(), ll_grads.flatten()))
-
-    def warping_function_gradients(self, Kiy):
-        grad_y = self.warping_function.fgrad_y(self.Y_untransformed, self.warping_params)
-        grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed, self.warping_params,
-                                                                 return_covar_chain=True)
-        djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
-        dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
-        return -dquad_dpsi + djac_dpsi
+    def parameters_changed(self):
+        self.Y[:] = self.transform_data()
+        super(WarpedGP, self).parameters_changed()
+
+        Kiy = self.posterior.woodbury_vector.flatten()
+
+        grad_y = self.warping_function.fgrad_y(self.Y_untransformed)
+        grad_y_psi, grad_psi = self.warping_function.fgrad_y_psi(self.Y_untransformed,
+                                                                 return_covar_chain=True)
+        djac_dpsi = ((1.0 / grad_y[:, :, None, None]) * grad_y_psi).sum(axis=0).sum(axis=0)
+        dquad_dpsi = (Kiy[:, None, None, None] * grad_psi).sum(axis=0).sum(axis=0)
+        warping_grads = -dquad_dpsi + djac_dpsi
+        self.warping_function.psi.gradient[:] = warping_grads[:, :-1]
+        self.warping_function.d.gradient[:] = warping_grads[0, -1]
+
+    def transform_data(self):
+        Y = self.warping_function.f(self.Y_untransformed.copy()).copy()
+        return Y
+
+    def log_likelihood(self):
+        ll = GP.log_likelihood(self)
+        jacobian = self.warping_function.fgrad_y(self.Y_untransformed)
+        return ll + np.log(jacobian).sum()

     def plot_warping(self):
-        self.warping_function.plot(self.warping_params, self.Y_untransformed.min(), self.Y_untransformed.max())
+        self.warping_function.plot(self.Y_untransformed.min(), self.Y_untransformed.max())

-    def predict(self, Xnew, which_parts='all', full_cov=False, pred_init=None):
+    def predict(self, Xnew, which_parts='all', pred_init=None):
         # normalize X values
-        Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
-        mu, var = GP._raw_predict(self, Xnew, full_cov=full_cov, which_parts=which_parts)
+        # Xnew = (Xnew.copy() - self._Xoffset) / self._Xscale
+        mu, var = GP._raw_predict(self, Xnew)

         # now push through likelihood
-        mean, var, _025pm, _975pm = self.likelihood.predictive_values(mu, var, full_cov)
+        mean, var = self.likelihood.predictive_values(mu, var)

         if self.predict_in_warped_space:
-            mean = self.warping_function.f_inv(mean, self.warping_params, y=pred_init)
-            var = self.warping_function.f_inv(var, self.warping_params)
+            mean = self.warping_function.f_inv(mean, y=pred_init)
+            var = self.warping_function.f_inv(var)
         if self.scale_data:
             mean = self._unscale_data(mean)
-        return mean, var, _025pm, _975pm
+        return mean, var
+
+if __name__ == '__main__':
+    X = np.random.randn(100, 1)
+    Y = np.sin(X) + np.random.randn(100, 1)*0.05
+    m = WarpedGP(X, Y)
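For reference, the gradient assembled in parameters_changed comes from the warped-GP objective

    log p(y) = log N(f(y) | 0, K + sigma^2 I) + sum_i log f'(y_i),

whose derivative with respect to the warping parameters psi is

    d/dpsi = -(K_y^{-1} f(y))^T df(y)/dpsi + sum_i (1/f'(y_i)) df'(y_i)/dpsi,

i.e. dquad_dpsi (built from the woodbury vector Kiy = K_y^{-1} f(y)) and djac_dpsi respectively.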

View file

@@ -6,7 +6,11 @@ try:
     from matplotlib.patches import Polygon
     from matplotlib.collections import PatchCollection
     #from matplotlib import cm
-    pb.ion()
+    try:
+        __IPYTHON__
+        pb.ion()
+    except NameError:
+        pass
 except:
     pass
 import re

View file

@@ -256,13 +256,23 @@ class KernelGradientTestsContinuous(unittest.TestCase):
         k.randomize()
         self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))

+    def test_Prod1(self):
+        k = GPy.kern.RBF(self.D) * GPy.kern.Linear(self.D)
+        k.randomize()
+        self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
+
     def test_Prod2(self):
-        k = (GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D))
+        k = GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D)
         k.randomize()
         self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))

     def test_Prod3(self):
-        k = (GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D))
+        k = GPy.kern.RBF(self.D) * GPy.kern.Linear(self.D) * GPy.kern.Bias(self.D)
+        k.randomize()
+        self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
+
+    def test_Prod4(self):
+        k = GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D) * GPy.kern.Matern32(2, active_dims=[0,1])
         k.randomize()
         self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
@@ -401,11 +411,27 @@ class Coregionalize_weave_test(unittest.TestCase):
         GPy.util.config.config.set('weave', 'working', 'False')

+class KernelTestsProductWithZeroValues(unittest.TestCase):
+    def setUp(self):
+        self.X = np.array([[0,1],[1,0]])
+        self.k = GPy.kern.Linear(2) * GPy.kern.Bias(2)
+
+    def test_zero_valued_kernel_full(self):
+        self.k.update_gradients_full(1, self.X)
+        self.assertFalse(np.isnan(self.k['linear.variances'].gradient),
+                         "Gradient resulted in NaN")
+
+    def test_zero_valued_kernel_gradients_X(self):
+        target = self.k.gradients_X(1, self.X)
+        self.assertFalse(np.any(np.isnan(target)),
+                         "Gradient resulted in NaN")
+
 if __name__ == "__main__":
     print "Running unit tests, please be (very) patient..."
     unittest.main()

 # np.random.seed(0)
 # N0 = 3
 # N1 = 9

View file

@@ -0,0 +1,72 @@
+# Copyright (c) 2012, 2013 GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import unittest
+import numpy as np
+import GPy
+
+class MappingGradChecker(GPy.core.Model):
+    """
+    This class has everything we need to check the gradient of a mapping. It
+    implements a simple likelihood which is a weighted sum of the outputs of
+    the mapping. The gradients are checked against the parameters of the
+    mapping and the input.
+    """
+    def __init__(self, mapping, X, name='map_grad_check'):
+        super(MappingGradChecker, self).__init__(name)
+        self.mapping = mapping
+        self.link_parameter(self.mapping)
+        self.X = GPy.core.Param('X', X)
+        self.link_parameter(self.X)
+        self.dL_dY = np.random.randn(self.X.shape[0], self.mapping.output_dim)
+
+    def log_likelihood(self):
+        return np.sum(self.mapping.f(self.X) * self.dL_dY)
+
+    def parameters_changed(self):
+        self.X.gradient = self.mapping.gradients_X(self.dL_dY, self.X)
+        self.mapping.update_gradients(self.dL_dY, self.X)
+
+class MappingTests(unittest.TestCase):
+
+    def test_kernelmapping(self):
+        X = np.random.randn(100,3)
+        Z = np.random.randn(10,3)
+        mapping = GPy.mappings.Kernel(3, 2, Z, GPy.kern.RBF(3))
+        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+
+    def test_linearmapping(self):
+        mapping = GPy.mappings.Linear(3, 2)
+        X = np.random.randn(100,3)
+        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+
+    def test_mlpmapping(self):
+        mapping = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
+        X = np.random.randn(100,3)
+        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+
+    def test_addmapping(self):
+        m1 = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
+        m2 = GPy.mappings.Linear(input_dim=3, output_dim=2)
+        mapping = GPy.mappings.Additive(m1, m2)
+        X = np.random.randn(100,3)
+        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+
+    def test_compoundmapping(self):
+        m1 = GPy.mappings.MLP(input_dim=3, hidden_dim=5, output_dim=2)
+        Z = np.random.randn(10,2)
+        m2 = GPy.mappings.Kernel(2, 4, Z, GPy.kern.RBF(2))
+        mapping = GPy.mappings.Compound(m1, m2)
+        X = np.random.randn(100,3)
+        self.assertTrue(MappingGradChecker(mapping, X).checkgrad())
+
+if __name__ == "__main__":
+    print "Running unit tests, please be (very) patient..."
+    unittest.main()

View file

@@ -32,3 +32,23 @@ class SVGP_classification(np.testing.TestCase):
         self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k)

     def test_grad(self):
         assert self.m.checkgrad(step=1e-4)

+class SVGP_Poisson_with_meanfunction(np.testing.TestCase):
+    """
+    Inference in the SVGP with a Poisson likelihood and a linear mean function
+    """
+    def setUp(self):
+        X = np.linspace(0,10,100).reshape(-1,1)
+        Z = np.linspace(0,10,10).reshape(-1,1)
+        latent_f = np.exp(0.1*X * 0.05*X**2)
+        Y = np.array([np.random.poisson(f) for f in latent_f.flatten()]).reshape(-1,1)
+        mf = GPy.mappings.Linear(1,1)
+        lik = GPy.likelihoods.Poisson()
+        k = GPy.kern.RBF(1, lengthscale=5.) + GPy.kern.White(1, 1e-6)
+        self.m = GPy.core.SVGP(X, Y, Z=Z, likelihood=lik, kernel=k, mean_function=mf)
+
+    def test_grad(self):
+        assert self.m.checkgrad(step=1e-4)

View file

@@ -96,16 +96,21 @@ def jitchol(A, maxtries=5):
         num_tries = 1
         while num_tries <= maxtries and np.isfinite(jitter):
             try:
+                print jitter
                 L = linalg.cholesky(A + np.eye(A.shape[0]) * jitter, lower=True)
+                logging.warning('Added {} rounds of jitter, jitter of {:.10e}\n'.format(num_tries, jitter))
                 return L
             except:
                 jitter *= 10
+            finally:
                 num_tries += 1
-        raise linalg.LinAlgError, "not positive definite, even with jitter."
         import traceback
-        logging.warning('\n'.join(['Added {} rounds of jitter, jitter of {:.10e}'.format(num_tries-1, jitter),
-                                   ' in '+traceback.format_list(traceback.extract_stack(limit=2)[-2:-1])[0][2:]]))
-        raise linalg.LinAlgError, "not positive definite, even with jitter."
+        try: raise
+        except:
+            logging.warning('\n'.join(['Added jitter of {:.10e}'.format(jitter),
+                                       ' in '+traceback.format_list(traceback.extract_stack(limit=2)[-2:-1])[0][2:]]))
+            import ipdb;ipdb.set_trace()
+            return L
# def dtrtri(L, lower=1):
#     """

View file

@@ -1,17 +1,18 @@
 # Copyright (c) 2012, GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

 import numpy as np
+from GPy.core.parameterization import Parameterized, Param
+from ..core.parameterization.transformations import Logexp

-class WarpingFunction(object):
+class WarpingFunction(Parameterized):
     """
     abstract function for warping
     z = f(y)
     """
-    def __init__(self):
-        raise NotImplementedError
+    def __init__(self, name):
+        super(WarpingFunction, self).__init__(name=name)

     def f(self,y,psi):
         """function transformation
@@ -34,9 +35,10 @@ class WarpingFunction(object):
     def _get_param_names(self):
         raise NotImplementedError

-    def plot(self, psi, xmin, xmax):
+    def plot(self, xmin, xmax):
+        psi = self.psi
         y = np.arange(xmin, xmax, 0.01)
-        f_y = self.f(y, psi)
+        f_y = self.f(y)
         from matplotlib import pyplot as plt
         plt.figure()
         plt.plot(y, f_y)
@@ -50,6 +52,7 @@ class TanhWarpingFunction(WarpingFunction):
         """n_terms specifies the number of tanh terms to be used"""
         self.n_terms = n_terms
         self.num_parameters = 3 * self.n_terms
+        super(TanhWarpingFunction, self).__init__(name='warp_tanh')

     def f(self,y,psi):
         """
@@ -163,8 +166,18 @@ class TanhWarpingFunction_d(WarpingFunction):
         """n_terms specifies the number of tanh terms to be used"""
         self.n_terms = n_terms
         self.num_parameters = 3 * self.n_terms + 1
+        self.psi = np.ones((self.n_terms, 3))

-    def f(self,y,psi):
+        super(TanhWarpingFunction_d, self).__init__(name='warp_tanh')
+        self.psi = Param('psi', self.psi)
+        self.psi[:, :2].constrain_positive()
+        self.d = Param('%s' % ('d'), 1.0, Logexp())
+        self.link_parameter(self.psi)
+        self.link_parameter(self.d)
+
+    def f(self,y):
         """
         Transform y with f using parameter vector psi
         psi = [[a,b,c]]
@@ -175,9 +188,9 @@ class TanhWarpingFunction_d(WarpingFunction):
         #1. check that number of params is consistent
         # assert psi.shape[0] == self.n_terms, 'inconsistent parameter dimensions'
         # assert psi.shape[1] == 4, 'inconsistent parameter dimensions'
-        mpsi = psi.copy()
-        d = psi[-1]
-        mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3)
+        d = self.d
+        mpsi = self.psi

         #3. transform data
         z = d*y.copy()
@@ -187,7 +200,7 @@ class TanhWarpingFunction_d(WarpingFunction):
         return z

-    def f_inv(self, z, psi, max_iterations=1000, y=None):
+    def f_inv(self, z, max_iterations=1000, y=None):
         """
         calculate the numerical inverse of f
@@ -198,12 +211,12 @@ class TanhWarpingFunction_d(WarpingFunction):
         z = z.copy()
         if y is None:
             y = np.ones_like(z)

         it = 0
         update = np.inf
         while it == 0 or (np.abs(update).sum() > 1e-10 and it < max_iterations):
-            update = (self.f(y, psi) - z)/self.fgrad_y(y, psi)
+            update = (self.f(y) - z)/self.fgrad_y(y)
             y -= update
             it += 1
         if it == max_iterations:
@@ -212,7 +225,7 @@ class TanhWarpingFunction_d(WarpingFunction):
         return y

-    def fgrad_y(self, y, psi, return_precalc = False):
+    def fgrad_y(self, y, return_precalc = False):
         """
         gradient of f w.r.t to y ([N x 1])
@@ -221,9 +234,8 @@ class TanhWarpingFunction_d(WarpingFunction):
         """
-        mpsi = psi.copy()
-        d = psi[-1]
-        mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3)
+        d = self.d
+        mpsi = self.psi

         # vectorized version
@@ -240,7 +252,7 @@ class TanhWarpingFunction_d(WarpingFunction):
         return GRAD

-    def fgrad_y_psi(self, y, psi, return_covar_chain = False):
+    def fgrad_y_psi(self, y, return_covar_chain = False):
         """
         gradient of f w.r.t to y and psi
@@ -248,10 +260,10 @@ class TanhWarpingFunction_d(WarpingFunction):
         """
-        mpsi = psi.copy()
-        mpsi = mpsi[:self.num_parameters-1].reshape(self.n_terms, 3)
-        w, s, r, d = self.fgrad_y(y, psi, return_precalc = True)
+        mpsi = self.psi
+        w, s, r, d = self.fgrad_y(y, return_precalc = True)

         gradients = np.zeros((y.shape[0], y.shape[1], len(mpsi), 4))
         for i in range(len(mpsi)):