mirror of
https://github.com/SheffieldML/GPy.git
synced 2026-06-11 15:15:15 +02:00
Merge branch 'master' of github.com:SheffieldML/GPy
This commit is contained in:
commit
ba78e720f2
8 changed files with 208 additions and 47 deletions
142
GPy/inference/SCG.py
Normal file
142
GPy/inference/SCG.py
Normal file
|
|
@ -0,0 +1,142 @@
|
|||
#Copyright I. Nabney, N.Lawrence and James Hensman (1996 - 2012)
|
||||
|
||||
#Scaled Conjuagte Gradients, originally in Matlab as part of the Netlab toolbox by I. Nabney, converted to python N. Lawrence and given a pythonic interface by James Hensman
|
||||
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT
|
||||
# HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
|
||||
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT
|
||||
# NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
# REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY
|
||||
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT
|
||||
# OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
# HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
|
||||
# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
# POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
import numpy as np
|
||||
|
||||
def SCG(f, gradf, x, optargs=(), maxiters=500, display=True, xtol=1e-6, ftol=1e-6):
|
||||
"""
|
||||
Optimisation through Scaled Conjugate Gradients (SCG)
|
||||
|
||||
f: the objective function
|
||||
gradf : the gradient function (should return a 1D np.ndarray)
|
||||
x : the initial condition
|
||||
|
||||
Returns
|
||||
x the optimal value for x
|
||||
flog : a list of all the objective values
|
||||
pointlog : a list of the x values tried
|
||||
scalelog : a list of the scales used in optimisation (beta)
|
||||
|
||||
"""
|
||||
|
||||
sigma0 = 1.0e-4
|
||||
fold = f(x, *optargs) # Initial function value.
|
||||
fnow = fold
|
||||
gradnew = gradf(x, *optargs) # Initial gradient.
|
||||
gradold = gradnew.copy()
|
||||
d = -gradnew # Initial search direction.
|
||||
success = True # Force calculation of directional derivs.
|
||||
nsuccess = 0 # nsuccess counts number of successes.
|
||||
beta = 1.0 # Initial scale parameter.
|
||||
betamin = 1.0e-15 # Lower bound on scale.
|
||||
betamax = 1.0e100 # Upper bound on scale.
|
||||
|
||||
flog = [fold]
|
||||
pointlog = [x.copy()]
|
||||
scalelog = [beta]
|
||||
|
||||
iteration = 0
|
||||
|
||||
# Main optimization loop.
|
||||
while iteration < maxiters:
|
||||
|
||||
# Calculate first and second directional derivatives.
|
||||
if success:
|
||||
mu = np.dot(d, gradnew)
|
||||
if mu >= 0:
|
||||
d = -gradnew
|
||||
mu = np.dot(d, gradnew)
|
||||
kappa = np.dot(d, d)
|
||||
#if kappa < eps():
|
||||
#return x, flog, pointlog, scalelog
|
||||
sigma = sigma0/np.sqrt(kappa)
|
||||
xplus = x + sigma*d
|
||||
gplus = gradf(xplus, *optargs)
|
||||
theta = np.dot(d, (gplus - gradnew))/sigma
|
||||
|
||||
# Increase effective curvature and evaluate step size alpha.
|
||||
delta = theta + beta*kappa
|
||||
if delta <= 0:
|
||||
delta = beta*kappa
|
||||
beta = beta - theta/kappa
|
||||
|
||||
alpha = - mu/delta
|
||||
|
||||
# Calculate the comparison ratio.
|
||||
xnew = x + alpha*d
|
||||
fnew = f(xnew, *optargs)
|
||||
Delta = 2.*(fnew - fold)/(alpha*mu)
|
||||
if Delta >= 0.:
|
||||
success = True
|
||||
nsuccess += 1
|
||||
x = xnew
|
||||
fnow = fnew
|
||||
else:
|
||||
success = False
|
||||
fnow = fold
|
||||
|
||||
# Store relevant variables
|
||||
flog.append(fnow) # Current function value
|
||||
pointlog.append(x) # Current position
|
||||
scalelog.append(beta) # current scale parameter
|
||||
|
||||
iteration += 1
|
||||
if display:
|
||||
print 'Iteration:', iteration, ' Objective:', fnow, ' Scale:', beta
|
||||
|
||||
if success:
|
||||
# Test for termination
|
||||
if np.max(np.abs(alpha*d)) < xtol or np.max(np.abs(fnew-fold)) < ftol:
|
||||
return x, flog, pointlog, scalelog
|
||||
|
||||
else:
|
||||
# Update variables for new position
|
||||
fold = fnew
|
||||
gradold = gradnew
|
||||
gradnew = gradf(x, *optargs)
|
||||
# If the gradient is zero then we are done.
|
||||
if np.dot(gradnew,gradnew) == 0:
|
||||
return x, flog, pointlog, scalelog
|
||||
|
||||
# Adjust beta according to comparison ratio.
|
||||
if Delta < 0.25:
|
||||
beta = min(4.0*beta, betamax)
|
||||
if Delta > 0.75:
|
||||
beta = max(0.5*beta, betamin)
|
||||
|
||||
# Update search direction using Polak-Ribiere formula, or re-start
|
||||
# in direction of negative gradient after nparams steps.
|
||||
if nsuccess == x.size:
|
||||
d = -gradnew
|
||||
nsuccess = 0
|
||||
elif success:
|
||||
gamma = np.dot(gradold - gradnew,gradnew)/(mu)
|
||||
d = gamma*d - gradnew
|
||||
|
||||
# If we get here, then we haven't terminated in the given number of
|
||||
# iterations.
|
||||
if display:
|
||||
print "maxiter exceeded"
|
||||
|
||||
return x, flog, pointlog, scalelog
|
||||
|
|
@ -196,6 +196,16 @@ class opt_rasm(Optimizer):
|
|||
|
||||
self.trace = opt_result[1]
|
||||
|
||||
class opt_scg(Optimizer):
|
||||
def __init__(self, *args, **kwargs):
|
||||
Optimizer.__init__(self, *args, **kwargs)
|
||||
self.opt_name = "Scaled Conjugate Gradients"
|
||||
|
||||
def opt(self, f_fp = None, f = None, fp = None):
|
||||
assert not f is None
|
||||
assert not fp is None
|
||||
opt_result = SCG (f,fp,self.x_init, display=self.messages,
|
||||
|
||||
def get_optimizer(f_min):
|
||||
# import rasmussens_minimize as rasm
|
||||
from SGD import opt_SGD
|
||||
|
|
|
|||
|
|
@ -73,8 +73,8 @@ class rational_quadratic(kernpart):
|
|||
if X2 is None: X2 = X
|
||||
dist2 = np.square((X-X2.T)/self.lengthscale)
|
||||
|
||||
dX = -self.variance*self.power * (X-X2.T)/self.lengthscale**2 * (1 + dist2/2./self.power)**(-self.power-1)
|
||||
target += np.sum(dL_dK*dX)
|
||||
dX = -self.variance*self.power * (X-X2.T)/self.lengthscale**2 * (1 + dist2/2./self.lengthscale)**(-self.power-1)
|
||||
target += np.sum(dL_dK*dX,1)[:,np.newaxis]
|
||||
|
||||
def dKdiag_dX(self,dL_dKdiag,X,target):
|
||||
pass
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@
|
|||
|
||||
import numpy as np
|
||||
import pylab as pb
|
||||
from ..util.linalg import mdot, jitchol, chol_inv, pdinv
|
||||
from ..util.linalg import mdot, jitchol, chol_inv, pdinv, trace_dot
|
||||
from ..util.plot import gpplot
|
||||
from .. import kern
|
||||
from GP import GP
|
||||
|
|
@ -80,7 +80,7 @@ class sparse_GP(GP):
|
|||
|
||||
#The rather complex computations of psi2_beta_scaled
|
||||
if self.likelihood.is_heteroscedastic:
|
||||
assert self.likelihood.D == 1 #TODO: what is the likelihood is heterscedatic and there are multiple independent outputs?
|
||||
assert self.likelihood.D == 1 #TODO: what if the likelihood is heterscedatic and there are multiple independent outputs?
|
||||
if self.has_uncertain_inputs:
|
||||
self.psi2_beta_scaled = (self.psi2*(self.likelihood.precision.flatten().reshape(self.N,1,1)/sf2)).sum(0)
|
||||
else:
|
||||
|
|
@ -102,17 +102,16 @@ class sparse_GP(GP):
|
|||
self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B)
|
||||
|
||||
self.psi1V = np.dot(self.psi1, self.V)
|
||||
self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T)
|
||||
tmp = np.dot(self.Lmi.T, self.LBi.T)
|
||||
self.C = np.dot(tmp,tmp.T)
|
||||
#self.C = mdot(self.Lmi.T, self.Bi, self.Lmi)
|
||||
#self.E = mdot(self.C, self.psi1VVpsi1/sf2, self.C.T)
|
||||
tmp = np.dot(self.C,self.psi1V/sf)
|
||||
self.E = np.dot(tmp,tmp.T)
|
||||
self.Cpsi1V = np.dot(self.C,self.psi1V)
|
||||
self.Cpsi1VVpsi1 = np.dot(self.Cpsi1V,self.psi1V.T)
|
||||
self.E = np.dot(self.Cpsi1VVpsi1,self.C)/sf2
|
||||
|
||||
# Compute dL_dpsi # FIXME: this is untested for the heterscedastic + uncertin inputs case
|
||||
self.dL_dpsi0 = - 0.5 * self.D * (self.likelihood.precision * np.ones([self.N,1])).flatten()
|
||||
self.dL_dpsi1 = mdot(self.V, self.psi1V.T,self.C).T
|
||||
#self.dL_dpsi1 = mdot(self.V, self.psi1V.T,self.C).T
|
||||
self.dL_dpsi1 = np.dot(self.Cpsi1V,self.V.T)
|
||||
if self.likelihood.is_heteroscedastic:
|
||||
if self.has_uncertain_inputs:
|
||||
self.dL_dpsi2 = 0.5 * self.likelihood.precision[:,None,None] * self.D * self.Kmmi[None,:,:] # dB
|
||||
|
|
@ -138,8 +137,9 @@ class sparse_GP(GP):
|
|||
|
||||
# Compute dL_dKmm
|
||||
self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi)*sf2 # dB
|
||||
self.dL_dKmm += -0.5 * self.D * (- self.C/sf2 - 2.*mdot(self.C, self.psi2_beta_scaled, self.Kmmi) + self.Kmmi) # dC
|
||||
self.dL_dKmm += np.dot(np.dot(self.E*sf2, self.psi2_beta_scaled) - np.dot(self.C, self.psi1VVpsi1), self.Kmmi) + 0.5*self.E # dD
|
||||
#self.dL_dKmm += -0.5 * self.D * (- self.C/sf2 - 2.*mdot(self.C, self.psi2_beta_scaled, self.Kmmi) + self.Kmmi) # dC
|
||||
#self.dL_dKmm += np.dot(np.dot(self.E*sf2, self.psi2_beta_scaled) - self.Cpsi1VVpsi1, self.Kmmi) + 0.5*self.E # dD
|
||||
self.dL_dKmm += 0.5*(self.D*(self.C/sf2 -self.Kmmi) + self.E) + np.dot(np.dot(self.D*self.C + self.E*sf2,self.psi2_beta_scaled) - self.Cpsi1VVpsi1,self.Kmmi) # d(C+D)
|
||||
|
||||
#the partial derivative vector for the likelihood
|
||||
if self.likelihood.Nparams ==0:
|
||||
|
|
@ -156,8 +156,8 @@ class sparse_GP(GP):
|
|||
beta = self.likelihood.precision
|
||||
dbeta = 0.5 * self.N*self.D/beta - 0.5 * np.sum(np.square(self.likelihood.Y))
|
||||
dbeta += - 0.5 * self.D * (self.psi0.sum() - np.trace(self.A)/beta*sf2)
|
||||
dbeta += - 0.5 * self.D * np.sum(self.Bi*self.A)/beta
|
||||
dbeta += np.sum((self.C - 0.5 * mdot(self.C,self.psi2_beta_scaled,self.C) ) * self.psi1VVpsi1 )/beta
|
||||
dbeta += - 0.5 * self.D * trace_dot(self.Bi,self.A)/beta
|
||||
dbeta += np.trace(self.Cpsi1VVpsi1)/beta - 0.5 * trace_dot(np.dot(self.C,self.psi2_beta_scaled) , self.Cpsi1VVpsi1 )/beta
|
||||
self.partial_for_likelihood = -dbeta*self.likelihood.precision**2
|
||||
|
||||
|
||||
|
|
@ -198,7 +198,7 @@ class sparse_GP(GP):
|
|||
A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.likelihood.precision)) -0.5*self.likelihood.precision*self.likelihood.trYYT
|
||||
B = -0.5*self.D*(np.sum(self.likelihood.precision*self.psi0) - np.trace(self.A)*sf2)
|
||||
C = -0.5*self.D * (self.B_logdet + self.M*np.log(sf2))
|
||||
D = +0.5*np.sum(self.psi1VVpsi1 * self.C)
|
||||
D = 0.5*np.trace(self.Cpsi1VVpsi1)
|
||||
return A+B+C+D
|
||||
|
||||
def _log_likelihood_gradients(self):
|
||||
|
|
|
|||
|
|
@ -14,6 +14,13 @@ import types
|
|||
#import scipy.lib.lapack.flapack
|
||||
import scipy as sp
|
||||
|
||||
def trace_dot(a,b):
|
||||
"""
|
||||
efficiently compute the trace of the matrix product of a and b
|
||||
"""
|
||||
assert a.shape==b.T.shape
|
||||
return np.dot(a.flatten(),b.T.flatten())
|
||||
|
||||
def mdot(*args):
|
||||
"""Multiply all the arguments using matrix product rules.
|
||||
The output is equivalent to multiplying the arguments one by one
|
||||
|
|
|
|||
|
|
@ -5,35 +5,37 @@ List of implemented kernels
|
|||
|
||||
The following table shows the implemented kernels in GPy and gives the details of the implemented function for each kernel.
|
||||
|
||||
==================== =========== ====== ======= =========== =============== ======= =========== ====== ====== =======
|
||||
NAME get/set K Kdiag dK_dtheta dKdiag_dtheta dK_dX dKdiag_dX psi0 psi1 psi2
|
||||
==================== =========== ====== ======= =========== =============== ======= =========== ====== ====== =======
|
||||
bias |tick| |tick| |tick| |tick| |tick| |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
Brownian |tick| |tick| |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
exponential |tick| |tick| |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
finite_dimensional |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
linear |tick| |tick| |tick| |tick| |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
Matern32 |tick| |tick| |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
Matern52 |tick| |tick| |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
periodic_exponential |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
periodic_Matern32 |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
periodic_Matern52 |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
rbf |tick| |tick| |tick| |tick| |tick| |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
spline |tick| |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
white |tick| |tick| |tick| |tick| |tick| |tick| |tick| |tick| |tick| |tick|
|
||||
==================== =========== ====== ======= =========== =============== ======= =========== ====== ====== =======
|
||||
==================== =========== ===== =========== ====== ======= =========== =============== ======= =========== ====== ====== =======
|
||||
NAME Dimension ARD get/set K Kdiag dK_dtheta dKdiag_dtheta dK_dX dKdiag_dX psi0 psi1 psi2
|
||||
==================== =========== ===== =========== ====== ======= =========== =============== ======= =========== ====== ====== =======
|
||||
bias n |tick| |tick| |tick| |tick| |tick| |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ----- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
Brownian 1 |tick| |tick| |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ----- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
exponential n yes |tick| |tick| |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ----- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
finite_dimensional n |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ----- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
linear n yes |tick| |tick| |tick| |tick| |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ----- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
Matern32 n yes |tick| |tick| |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ----- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
Matern52 n yes |tick| |tick| |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ----- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
periodic_exponential 1 |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ----- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
periodic_Matern32 1 |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ----- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
periodic_Matern52 1 |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ----- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
rational quadratic 1 |tick| |tick| |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ----- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
rbf n yes |tick| |tick| |tick| |tick| |tick| |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ----- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
spline 1 |tick| |tick| |tick| |tick| |tick| |tick|
|
||||
-------------------- ----------- ----- ----------- ------ ------- ----------- --------------- ------- ----------- ------ ------ -------
|
||||
white n |tick| |tick| |tick| |tick| |tick| |tick| |tick| |tick| |tick| |tick|
|
||||
==================== =========== ===== =========== ====== ======= =========== =============== ======= =========== ====== ====== =======
|
||||
|
||||
Depending on the use, all functions may not be required
|
||||
|
||||
|
|
|
|||
|
|
@ -136,8 +136,8 @@ Computes the derivative of the likelihood with respect to the inputs ``X`` (a :m
|
|||
if X2 is None: X2 = X
|
||||
dist2 = np.square((X-X2.T)/self.lengthscale)
|
||||
|
||||
dX = -self.variance*self.power * (X-X2.T)/self.lengthscale**2 * (1 + dist2/2./self.power)**(-self.power-1)
|
||||
target += np.sum(dL_dK*dX)
|
||||
dX = -self.variance*self.power * (X-X2.T)/self.lengthscale**2 * (1 + dist2/2./self.lengthscale)**(-self.power-1)
|
||||
target += np.sum(dL_dK*dX,1)[:,np.newaxis]
|
||||
|
||||
**dKdiag_dX(self,dL_dKdiag,X,target)**
|
||||
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ return::
|
|||
Implemented kernels
|
||||
===================
|
||||
|
||||
Many kernels are already implemented in GPy. A comprehensive list can be found `here <kernel_implementation.html>`_ and the following figure gives a summary of most of them:
|
||||
Many kernels are already implemented in GPy. The following figure gives a summary of most of them (a comprehensive list can be list can be found `here <kernel_implementation.html>`_):
|
||||
|
||||
.. figure:: Figures/tuto_kern_overview_allkern.png
|
||||
:align: center
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue