mirror of
https://github.com/SheffieldML/GPy.git
synced 2026-05-21 14:05:14 +02:00
Merge branch 'devel' of github.com:SheffieldML/GPy into devel
This commit is contained in:
commit
e52dad80f3
35 changed files with 1142 additions and 7905 deletions
|
|
@ -51,6 +51,44 @@ def linear(input_dim,variances=None,ARD=False):
|
|||
part = parts.linear.Linear(input_dim,variances,ARD)
|
||||
return kern(input_dim, [part])
|
||||
|
||||
def mlp(input_dim,variance=1., weight_variance=None,bias_variance=100.,ARD=False):
    """
    Construct an MLP kernel

    All arguments are forwarded unchanged to ``parts.mlp.MLP``; the resulting
    part is wrapped in a ``kern`` with the same ``input_dim``.

    :param input_dim: dimensionality of the kernel, obligatory
    :type input_dim: int
    :param variance: the variance of the kernel
    :type variance: float
    :param weight_variance: variance(s) of the input weights in the neural network (``None`` is forwarded as-is, leaving the default to ``parts.mlp.MLP``)
    :type weight_variance: float or vector of weight variances (length 1 if kernel is isotropic)
    :param bias_variance: the variance of the biases in the neural network.
    :type bias_variance: float
    :param ARD: Auto Relevance Determination (allows for ARD version of covariance)
    :type ARD: Boolean
    :rtype: kernel object
    """
    part = parts.mlp.MLP(input_dim,variance,weight_variance,bias_variance,ARD)
    return kern(input_dim, [part])
|
||||
|
||||
def poly(input_dim,variance=1., weight_variance=None,bias_variance=1.,degree=2, ARD=False):
    """
    Construct a polynomial kernel

    All arguments are forwarded unchanged to ``parts.poly.POLY``; the resulting
    part is wrapped in a ``kern`` with the same ``input_dim``.

    :param input_dim: dimensionality of the kernel, obligatory
    :type input_dim: int
    :param variance: the variance of the kernel
    :type variance: float
    :param weight_variance: variance(s) of the input weights (``None`` is forwarded as-is, leaving the default to ``parts.poly.POLY``)
    :type weight_variance: float or vector of weight variances
    :param bias_variance: the variance of the biases.
    :type bias_variance: float
    :param degree: the degree of the polynomial
    :type degree: int
    :param ARD: Auto Relevance Determination (allows for ARD version of covariance)
    :type ARD: Boolean
    :rtype: kernel object
    """
    part = parts.poly.POLY(input_dim,variance,weight_variance,bias_variance,degree,ARD)
    return kern(input_dim, [part])
|
||||
|
||||
def white(input_dim,variance=1.):
|
||||
"""
|
||||
Construct a white kernel.
|
||||
|
|
@ -253,6 +291,8 @@ def prod(k1,k2,tensor=False):
|
|||
|
||||
:param k1, k2: the kernels to multiply
|
||||
:type k1, k2: kernpart
|
||||
:param tensor: The kernels are multiplied either as functions defined on the same input space (default) or on the product of the input spaces
|
||||
:type tensor: Boolean
|
||||
:rtype: kernel object
|
||||
"""
|
||||
part = parts.prod.Prod(k1, k2, tensor)
|
||||
|
|
@ -260,13 +300,13 @@ def prod(k1,k2,tensor=False):
|
|||
|
||||
def symmetric(k):
    """
    Construct a symmetric kernel from an existing kernel

    :param k: the kernel to start from; it is copied, not modified
    :rtype: the copy of ``k``, with each of its parts wrapped in ``Symmetric``
    """
    k_ = k.copy()
    # Inside this module the bare name ``symmetric`` is this very function, so
    # ``symmetric.Symmetric`` can never resolve. Look the class up through the
    # ``parts`` package, as every other constructor here does.
    # NOTE(review): assumes ``parts.symmetric`` is importable -- confirm.
    k_.parts = [parts.symmetric.Symmetric(p) for p in k.parts]
    return k_
|
||||
|
||||
def coregionalise(Nout, R=1, W=None, kappa=None):
    """
    Construct a coregionalisation kernel.

    The arguments are forwarded unchanged to
    ``parts.coregionalise.Coregionalise``; the resulting part is wrapped in a
    ``kern`` whose input dimension is fixed to 1.

    :param Nout: forwarded to the part (presumably the number of outputs -- confirm)
    :param R: forwarded to the part (presumably the rank of ``W`` -- confirm)
    :param W: forwarded to the part; ``None`` leaves the default to the part
    :param kappa: forwarded to the part; ``None`` leaves the default to the part
    :rtype: kernel object
    """
    p = parts.coregionalise.Coregionalise(Nout,R,W,kappa)
    return kern(1,[p])
|
||||
|
||||
|
|
@ -291,11 +331,13 @@ def fixed(input_dim, K, variance=1.):
|
|||
"""
|
||||
Construct a Fixed effect kernel.
|
||||
|
||||
Arguments
|
||||
---------
|
||||
input_dim (int), obligatory
|
||||
K (np.array), obligatory
|
||||
variance (float)
|
||||
:param input_dim: the number of input dimensions
|
||||
:type input_dim: int (input_dim=1 is the only value currently supported)
|
||||
:param K: the fixed covariance matrix
|
||||
:type K: np.array
|
||||
:param variance: kernel variance
|
||||
:type variance: float
|
||||
:rtype: kern object
|
||||
"""
|
||||
part = parts.fixed.Fixed(input_dim, K, variance)
|
||||
return kern(input_dim, [part])
|
||||
|
|
@ -318,7 +360,7 @@ def independent_outputs(k):
|
|||
|
||||
def hierarchical(k):
|
||||
"""
|
||||
Construct a kernel with independent outputs from an existing kernel
|
||||
TODO: This can't be right! Construct a kernel with independent outputs from an existing kernel
|
||||
"""
|
||||
# for sl in k.input_slices:
|
||||
# assert (sl.start is None) and (sl.stop is None), "cannot adjust input slices! (TODO)"
|
||||
|
|
|
|||
|
|
@ -14,7 +14,10 @@ class kern(Parameterized):
|
|||
"""
|
||||
This is the main kernel class for GPy. It handles multiple (additive) kernel functions, and keeps track of various things like which parameters live where.
|
||||
|
||||
The technical code for kernels is divided into _parts_ (see e.g. rbf.py). This obnject contains a list of parts, which are computed additively. For multiplication, special _prod_ parts are used.
|
||||
The technical code for kernels is divided into _parts_ (see
|
||||
e.g. rbf.py). This object contains a list of parts, which are
|
||||
computed additively. For multiplication, special _prod_ parts
|
||||
are used.
|
||||
|
||||
:param input_dim: The dimensionality of the kernel's input space
|
||||
:type input_dim: int
|
||||
|
|
@ -149,7 +152,7 @@ class kern(Parameterized):
|
|||
return g
|
||||
|
||||
def compute_param_slices(self):
|
||||
"""create a set of slices that can index the parameters of each part"""
|
||||
"""create a set of slices that can index the parameters of each part."""
|
||||
self.param_slices = []
|
||||
count = 0
|
||||
for p in self.parts:
|
||||
|
|
@ -200,11 +203,19 @@ class kern(Parameterized):
|
|||
"""
|
||||
return self.prod(other)
|
||||
|
||||
def __pow__(self, other, tensor=False):
    """
    Shortcut for tensor `prod`.

    The product is always taken on the tensor (product) input space: the
    ``tensor`` argument is accepted but its value is not used.

    :param other: the kernel to multiply with
    """
    tensor_product = self.prod(other, tensor=True)
    return tensor_product
|
||||
|
||||
def prod(self, other, tensor=False):
|
||||
"""
|
||||
multiply two kernels (either on the same space, or on the tensor product of the input space)
|
||||
multiply two kernels (either on the same space, or on the tensor product of the input space).
|
||||
:param other: the other kernel to be multiplied
|
||||
:type other: GPy.kern
|
||||
:param tensor: whether or not to use the tensor space (default is false).
|
||||
:type tensor: bool
|
||||
"""
|
||||
K1 = self.copy()
|
||||
K2 = other.copy()
|
||||
|
|
@ -273,7 +284,7 @@ class kern(Parameterized):
|
|||
[p._set_params(x[s]) for p, s in zip(self.parts, self.param_slices)]
|
||||
|
||||
def _get_param_names(self):
|
||||
# this is a bit nasty: we wat to distinguish between parts with the same name by appending a count
|
||||
# this is a bit nasty: we want to distinguish between parts with the same name by appending a count
|
||||
part_names = np.array([k.name for k in self.parts], dtype=np.str)
|
||||
counts = [np.sum(part_names == ni) for i, ni in enumerate(part_names)]
|
||||
cum_counts = [np.sum(part_names[i:] == ni) for i, ni in enumerate(part_names)]
|
||||
|
|
@ -295,11 +306,13 @@ class kern(Parameterized):
|
|||
|
||||
def dK_dtheta(self, dL_dK, X, X2=None):
|
||||
"""
|
||||
:param dL_dK: An array of dL_dK derivaties, dL_dK
|
||||
:type dL_dK: Np.ndarray (N x num_inducing)
|
||||
Compute the gradient of the covariance function with respect to the parameters.
|
||||
|
||||
:param dL_dK: An array of gradients of the objective function with respect to the covariance function.
|
||||
:type dL_dK: np.ndarray (num_samples x num_inducing)
|
||||
:param X: Observed data inputs
|
||||
:type X: np.ndarray (N x input_dim)
|
||||
:param X2: Observed dara inputs (optional, defaults to X)
|
||||
:type X: np.ndarray (num_samples x input_dim)
|
||||
:param X2: Observed data inputs (optional, defaults to X)
|
||||
:type X2: np.ndarray (num_inducing x input_dim)
|
||||
"""
|
||||
assert X.shape[1] == self.input_dim
|
||||
|
|
@ -312,6 +325,14 @@ class kern(Parameterized):
|
|||
return self._transform_gradients(target)
|
||||
|
||||
def dK_dX(self, dL_dK, X, X2=None):
|
||||
"""Compute the gradient of the covariance function with respect to X.
|
||||
|
||||
:param dL_dK: An array of gradients of the objective function with respect to the covariance function.
|
||||
:type dL_dK: np.ndarray (num_samples x num_inducing)
|
||||
:param X: Observed data inputs
|
||||
:type X: np.ndarray (num_samples x input_dim)
|
||||
:param X2: Observed data inputs (optional, defaults to X)
|
||||
:type X2: np.ndarray (num_inducing x input_dim)"""
|
||||
if X2 is None:
|
||||
X2 = X
|
||||
target = np.zeros_like(X)
|
||||
|
|
@ -322,6 +343,7 @@ class kern(Parameterized):
|
|||
return target
|
||||
|
||||
def Kdiag(self, X, which_parts='all'):
|
||||
"""Compute the diagonal of the covariance function for inputs X."""
|
||||
if which_parts == 'all':
|
||||
which_parts = [True] * self.Nparts
|
||||
assert X.shape[1] == self.input_dim
|
||||
|
|
@ -330,6 +352,7 @@ class kern(Parameterized):
|
|||
return target
|
||||
|
||||
def dKdiag_dtheta(self, dL_dKdiag, X):
|
||||
"""Compute the gradient of the diagonal of the covariance function with respect to the parameters."""
|
||||
assert X.shape[1] == self.input_dim
|
||||
assert dL_dKdiag.size == X.shape[0]
|
||||
target = np.zeros(self.num_params)
|
||||
|
|
@ -373,16 +396,18 @@ class kern(Parameterized):
|
|||
return target
|
||||
|
||||
def dpsi1_dmuS(self, dL_dpsi1, Z, mu, S):
    """
    Accumulate the gradients of psi1 with respect to mu and S over all parts.

    Each part is handed the columns of Z, mu and S selected by its input
    slice, together with the matching columns of the two output arrays,
    which it is expected to update in place.

    :param dL_dpsi1: passed unchanged to every part
    :param Z: inducing inputs, sliced column-wise per part
    :param mu, S: means and variances, sliced column-wise per part
    :returns: (target_mu, target_S), two arrays each with the shape of mu
    """
    target_mu, target_S = np.zeros((2, mu.shape[0], mu.shape[1]))
    # Plain loop: the parts are called for their side effect on the targets.
    for p, i_s in zip(self.parts, self.input_slices):
        p.dpsi1_dmuS(dL_dpsi1, Z[:, i_s], mu[:, i_s], S[:, i_s], target_mu[:, i_s], target_S[:, i_s])
    return target_mu, target_S
|
||||
|
||||
def psi2(self, Z, mu, S):
|
||||
"""
|
||||
Compute the psi2 statistics for the covariance function.
|
||||
|
||||
:param Z: np.ndarray of inducing inputs (num_inducing x input_dim)
|
||||
:param mu, S: np.ndarrays of means and variances (each N x input_dim)
|
||||
:returns psi2: np.ndarray (N,num_inducing,num_inducing)
|
||||
:param mu, S: np.ndarrays of means and variances (each num_samples x input_dim)
|
||||
:returns psi2: np.ndarray (num_samples,num_inducing,num_inducing)
|
||||
"""
|
||||
target = np.zeros((mu.shape[0], Z.shape[0], Z.shape[0]))
|
||||
[p.psi2(Z[:, i_s], mu[:, i_s], S[:, i_s], target) for p, i_s in zip(self.parts, self.input_slices)]
|
||||
|
|
@ -406,6 +431,7 @@ class kern(Parameterized):
|
|||
return target
|
||||
|
||||
def dpsi2_dtheta(self, dL_dpsi2, Z, mu, S):
|
||||
"""Gradient of the psi2 statistics with respect to the parameters."""
|
||||
target = np.zeros(self.num_params)
|
||||
[p.dpsi2_dtheta(dL_dpsi2, Z[:, i_s], mu[:, i_s], S[:, i_s], target[ps]) for p, i_s, ps in zip(self.parts, self.input_slices, self.param_slices)]
|
||||
|
||||
|
|
@ -509,3 +535,4 @@ class kern(Parameterized):
|
|||
pb.title("k(x1,x2 ; %0.1f,%0.1f)" % (x[0, 0], x[0, 1]))
|
||||
else:
|
||||
raise NotImplementedError, "Cannot plot a kernel with more than two input dimensions"
|
||||
|
||||
|
|
|
|||
|
|
@ -8,9 +8,11 @@ import independent_outputs
|
|||
import linear
|
||||
import Matern32
|
||||
import Matern52
|
||||
import mlp
|
||||
import periodic_exponential
|
||||
import periodic_Matern32
|
||||
import periodic_Matern52
|
||||
import poly
|
||||
import prod_orthogonal
|
||||
import prod
|
||||
import rational_quadratic
|
||||
|
|
|
|||
|
|
@ -29,7 +29,11 @@ class Kernpart(object):
|
|||
def dK_dtheta(self,dL_dK,X,X2,target):
|
||||
raise NotImplementedError
|
||||
def dKdiag_dtheta(self,dL_dKdiag,X,target):
    """
    Gradient of the diagonal of the covariance with respect to the parameters.

    Generic fallback: calls ``dK_dtheta`` once per row of X, passing that
    row as a one-row array with ``X2=None`` and the matching entry of
    ``dL_dKdiag``. ``target`` is handed to every call, so it is expected to
    be accumulated into in place; nothing is returned.
    """
    # In the base case compute this by calling dK_dtheta. Need to
    # override for stationary covariances (for example) to save
    # time.
    for i in range(X.shape[0]):
        self.dK_dtheta(dL_dKdiag[i], X[i, :][None, :], X2=None, target=target)
|
||||
def psi0(self,Z,mu,S,target):
|
||||
raise NotImplementedError
|
||||
def dpsi0_dtheta(self,dL_dpsi0,Z,mu,S,target):
|
||||
|
|
@ -52,5 +56,21 @@ class Kernpart(object):
|
|||
raise NotImplementedError
|
||||
def dpsi2_dmuS(self,dL_dpsi2,Z,mu,S,target_mu,target_S):
|
||||
raise NotImplementedError
|
||||
def dK_dX(self, dL_dK, X, X2, target):
    """
    Gradient of the covariance with respect to X.

    Not provided by the base class: always raises ``NotImplementedError``,
    so subclasses must override it.
    """
    raise NotImplementedError
|
||||
|
||||
class Kernpart_inner(Kernpart):
    def __init__(self,input_dim):
        """
        Base class for kernel parts built on the inner product between inputs:
        a positive definite function which forms part of a kernel.

        :param input_dim: the number of input dimensions to the function
        :type input_dim: int

        Do not instantiate.
        """
        Kernpart.__init__(self, input_dim)

        # Cache placeholders: each attribute starts out as one row (a
        # one-element, uninitialised array) of a freshly allocated 3x1 block.
        stats_cache = np.empty(shape=(3, 1))
        self._Z, self._mu, self._S = stats_cache[0], stats_cache[1], stats_cache[2]
        inputs_cache = np.empty(shape=(3, 1))
        self._X, self._X2, self._params = inputs_cache[0], inputs_cache[1], inputs_cache[2]
|
||||
|
|
|
|||
164
GPy/kern/parts/mlp.py
Normal file
164
GPy/kern/parts/mlp.py
Normal file
|
|
@ -0,0 +1,164 @@
|
|||
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
from kernpart import Kernpart
|
||||
import numpy as np
|
||||
four_over_tau = 2./np.pi
|
||||
|
||||
class MLP(Kernpart):
    r"""
    multi layer perceptron kernel (also known as arc sine kernel or neural network kernel)

    .. math::

          k(x,y) = \sigma^2 \frac{2}{\pi} \text{asin} \left(\frac{\sigma_w^2 x^\top y+\sigma_b^2}{\sqrt{\sigma_w^2x^\top x + \sigma_b^2 + 1}\sqrt{\sigma_w^2 y^\top y + \sigma_b^2 +1}} \right)

    :param input_dim: the number of input dimensions
    :type input_dim: int
    :param variance: the variance :math:`\sigma^2`
    :type variance: float
    :param weight_variance: the vector of the variances of the prior over input weights in the neural network :math:`\sigma^2_w`
    :type weight_variance: array or list of the appropriate size (or float if there is only one weight variance parameter)
    :param bias_variance: the variance of the prior over bias parameters :math:`\sigma^2_b`
    :param ARD: Auto Relevance Determination. If equal to "False", the kernel is isotropic (ie. one weight variance parameter :math:`\sigma^2_w`), otherwise there is one weight variance parameter per dimension. Currently ignored: the kernel is always built as isotropic.
    :type ARD: Boolean
    :rtype: Kernpart object

    """

    def __init__(self, input_dim, variance=1., weight_variance=None, bias_variance=100., ARD=False):
        # The ARD branches of the covariance computations below are empty, so
        # the requested value is overridden and the kernel is always isotropic.
        ARD = False
        self.input_dim = input_dim
        self.ARD = ARD
        if not ARD:
            self.num_params=3
            if weight_variance is not None:
                weight_variance = np.asarray(weight_variance)
                assert weight_variance.size == 1, "Only one weight variance needed for non-ARD kernel"
            else:
                weight_variance = 100.*np.ones(1)
        else:
            self.num_params = self.input_dim + 2
            if weight_variance is not None:
                weight_variance = np.asarray(weight_variance)
                assert weight_variance.size == self.input_dim, "bad number of weight variances"
            else:
                weight_variance = np.ones(self.input_dim)

        self.name='mlp'
        self._set_params(np.hstack((variance, weight_variance.flatten(), bias_variance)))

    def _get_params(self):
        """Return the parameters as one vector: variance, weight variance(s), bias variance."""
        return np.hstack((self.variance, self.weight_variance.flatten(), self.bias_variance))

    def _set_params(self, x):
        """Unpack a parameter vector laid out as in ``_get_params``."""
        assert x.size == (self.num_params)
        self.variance = x[0]
        self.weight_variance = x[1:-1]
        self.weight_std = np.sqrt(self.weight_variance)
        self.bias_variance = x[-1]

    def _get_param_names(self):
        """Return one name per entry of the parameter vector."""
        if self.num_params == 3:
            return ['variance', 'weight_variance', 'bias_variance']
        else:
            # One name per weight variance (this class has no lengthscale).
            return ['variance'] + ['weight_variance_%i' % i for i in range(self.weight_variance.size)] + ['bias_variance']

    def K(self, X, X2, target):
        """Add the covariance between X and X2 (X and itself if X2 is None) to target."""
        self._K_computations(X, X2)
        target += self.variance*self._K_dvar

    def Kdiag(self, X, target):
        """Add the diagonal of the covariance matrix for X to target."""
        self._K_diag_computations(X)
        target+= self.variance*self._K_diag_dvar

    def dK_dtheta(self, dL_dK, X, X2, target):
        """
        Derivative of the covariance with respect to the parameters.

        Adds to target[0], target[1] and target[2] the gradients with respect
        to the variance, the weight variance and the bias variance.
        """
        self._K_computations(X, X2)
        denom3 = self._K_denom*self._K_denom*self._K_denom
        # variance * (2/pi) * d asin(u)/du, evaluated at the asin argument
        base = four_over_tau*self.variance/np.sqrt(1-self._K_asin_arg*self._K_asin_arg)
        base_cov_grad = base*dL_dK

        # Squared norms of the rows of X and X2; in the symmetric case they
        # are read off the diagonal of the cached inner-product matrix.
        if X2 is None:
            vec1 = vec2 = np.diag(self._K_inner_prod)
        else:
            vec1 = (X*X).sum(1)
            vec2 = (X2*X2).sum(1)

        # Derivative of the asin argument (numer/denom) by the quotient rule.
        target[1] += ((self._K_inner_prod/self._K_denom
                       -.5*self._K_numer/denom3
                       *(np.outer((self.weight_variance*vec1+self.bias_variance+1.), vec2)
                         +np.outer(vec1, (self.weight_variance*vec2+self.bias_variance+1.))))*base_cov_grad).sum()
        target[2] += ((1./self._K_denom
                       -.5*self._K_numer/denom3
                       *((vec1[:, None]+vec2[None, :])*self.weight_variance
                         +2.*self.bias_variance + 2.))*base_cov_grad).sum()

        target[0] += np.sum(self._K_dvar*dL_dK)

    def dK_dX(self, dL_dK, X, X2, target):
        """Derivative of the covariance matrix with respect to X (not implemented yet)."""
        # TODO: assemble this from _dK_dX_point, one row of X at a time.
        raise NotImplementedError

    def _dK_dX_point(self, dL_dK, X, X2, target, i):
        """
        Gradient with respect to one point of X

        Requires ``_K_computations(X, X2)`` to have been called and X2 to be
        an array (not None). ``dL_dK`` and ``target`` are not used: the result
        is not weighted by dL_dK and nothing is accumulated.

        :returns: array shaped like X2 whose row m is the gradient of K[i, m] with respect to X[i, :]
        """
        numer = self._K_numer[i, :]
        denom = self._K_denom[i, :]
        arg = self._K_asin_arg[i, :]
        vec2 = (X2*X2).sum(1)*self.weight_variance + self.bias_variance + 1.
        denom3 = denom*denom*denom
        # d(numer/denom)/dX[i, :], up to the common factor weight_variance
        gX = X2/denom[:, None] - (vec2*numer/denom3)[:, None]*X[i, :][None, :]
        return four_over_tau*self.weight_variance*self.variance*gX/np.sqrt(1-arg*arg)[:, None]

    def _K_computations(self, X, X2):
        """Cache the inner products, the asin argument and K/variance for X and X2."""
        if self.ARD:
            # ARD covariance is not implemented: nothing is cached.
            return
        if X2 is None:
            self._K_inner_prod = np.dot(X,X.T)
            self._K_numer = self._K_inner_prod*self.weight_variance+self.bias_variance
            vec1 = vec2 = np.diag(self._K_numer) + 1.
        else:
            self._K_inner_prod = np.dot(X,X2.T)
            self._K_numer = self._K_inner_prod*self.weight_variance + self.bias_variance
            vec1 = (X*X).sum(1)*self.weight_variance + self.bias_variance + 1.
            vec2 = (X2*X2).sum(1)*self.weight_variance + self.bias_variance + 1.
        self._K_denom = np.sqrt(np.outer(vec1,vec2))
        self._K_asin_arg = self._K_numer/self._K_denom
        self._K_dvar = four_over_tau*np.arcsin(self._K_asin_arg)

    def _K_diag_computations(self, X):
        """Cache the quantities needed for the diagonal of the covariance of X."""
        if self.ARD:
            # ARD covariance is not implemented: nothing is cached.
            return
        self._K_diag_numer = (X*X).sum(1)*self.weight_variance + self.bias_variance
        self._K_diag_denom = self._K_diag_numer+1.
        self._K_diag_dvar = four_over_tau*np.arcsin(self._K_diag_numer/self._K_diag_denom)
|
||||
Loading…
Add table
Add a link
Reference in a new issue