mirror of
https://github.com/SheffieldML/GPy.git
synced 2026-06-08 15:05:15 +02:00
svgp inference added -- not working yet
This commit is contained in:
parent
35a33f94e8
commit
2de2217473
5 changed files with 311 additions and 1 deletions
|
|
@ -7,5 +7,6 @@ from parameterization.param import Param, ParamConcatenation
|
|||
from parameterization.observable_array import ObsAr
|
||||
|
||||
from gp import GP
|
||||
from svgp import SVGP
|
||||
from sparse_gp import SparseGP
|
||||
from mapping import *
|
||||
|
|
|
|||
90
GPy/core/svgp.py
Normal file
90
GPy/core/svgp.py
Normal file
|
|
@ -0,0 +1,90 @@
|
|||
# Copyright (c) 2014, James Hensman, Alex Matthews
|
||||
# Distributed under the terms of the GNU General public License, see LICENSE.txt
|
||||
|
||||
import numpy as np
|
||||
from ..util import choleskies
|
||||
from sparse_gp import SparseGP
|
||||
from parameterization.param import Param
|
||||
|
||||
class SVGP(SparseGP):
|
||||
def __init__(self, X, Y, Z, kernel, likelihood, name='SVGP', Y_metadata=None):
|
||||
"""
|
||||
Stochastic Variational GP.
|
||||
|
||||
For Gaussian Likelihoods, this implements
|
||||
|
||||
Gaussian Processes for Big data, Hensman, Fusi and Lawrence, UAI 2013,
|
||||
|
||||
But without natural gradients. We'll use the lower-triangluar
|
||||
representation of the covariance matrix to ensure
|
||||
positive-definiteness.
|
||||
|
||||
For Non Gaussian Likelihoods, this implements
|
||||
|
||||
Hensman, Matthews and Ghahramani, Scalable Variational GP Classification, ArXiv 1411.2005
|
||||
"""
|
||||
|
||||
#create the SVI inference method
|
||||
from ..inference.latent_function_inference import SVGP as svgp_inf
|
||||
inf_method = svgp_inf()
|
||||
|
||||
SparseGP.__init__(self,X, Y, Z, kernel, likelihood, inference_method=inf_method,
|
||||
name=name, Y_metadata=Y_metadata, normalizer=False)
|
||||
|
||||
#?? self.set_data(X, Y)
|
||||
|
||||
self.m = Param('q_u_mean', np.zeros(self.num_inducing))
|
||||
chol = choleskies.triang_to_flat(np.eye(self.num_inducing)[:,:,None])
|
||||
self.chol = Param('q_u_chol', chol.flatten())
|
||||
self.link_parameter(self.chol)
|
||||
self.link_parameter(self.m)
|
||||
|
||||
#self.batch_scale = 1. # how to rescale the batch likelihood in case of minibatches
|
||||
|
||||
|
||||
def parameters_changed(self):
|
||||
self.posterior, self._log_marginal_likelihood, self.grad_dict = self.inference_method.inference(self.q_u_mean, self.q_u_chol, self.kern, self.X, self.Z, self.likelihood, self.Y, self.Y_metadata)
|
||||
|
||||
#update the kernel gradients
|
||||
self.kern.update_gradients_full(self.grad_dict['dL_dKmm'], self.Z)
|
||||
grad = self.kern.gradient.copy()
|
||||
self.kern.update_gradients_full(self.grad_dict['dL_dKmn'], self.Z, self.X)
|
||||
grad += self.kern.gradient
|
||||
self.kern.update_gradients_diag(self.grad_dict['dL_dKdiag'], self.X)
|
||||
self.kern.gradient += grad
|
||||
if not self.Z.is_fixed:# only compute these expensive gradients if we need them
|
||||
self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z) + self.kern.gradients_X(self.grad_dict['dL_dKmn'], self.Z, self.X)
|
||||
|
||||
|
||||
#update the variational parameter gradients:
|
||||
self.m.gradient = self.grad_dict['dL_dm']
|
||||
self.chol.gradient = self.grad_dict['dL_dchol']
|
||||
|
||||
|
||||
#def set_data(self, X, Y):
|
||||
#assert X.shape[1]==self.Z.shape[1]
|
||||
#self.X, self.Y = GPy.core.ObsAr(X), Y
|
||||
|
||||
def optimizeWithFreezingZ(self):
|
||||
self.Z.fix()
|
||||
self.kern.fix()
|
||||
self.optimize('bfgs')
|
||||
self.Z.unfix()
|
||||
self.kern.constrain_positive()
|
||||
self.optimize('bfgs')
|
||||
|
||||
#class SPGPC_stoch(SPGPC):
|
||||
#def __init__(self, X, Y, Z, kern=None, likelihood=None, batchsize=10):
|
||||
#SPGPC.__init__(self, X[:1], Y[:1], Z, kern, likelihood)
|
||||
#self.X_all, self.Y_all = X, Y
|
||||
#self.batchsize = batchsize
|
||||
#self.batch_scale = float(self.X_all.shape[0])/float(self.batchsize)
|
||||
#
|
||||
#def stochastic_grad(self, w):
|
||||
#i = np.random.permutation(self.X_all.shape[0])[:self.batchsize]
|
||||
#self.set_data(self.X_all[i], self.Y_all[i])
|
||||
#return self._grads(w)
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright (c) 2012, James Hensman
|
||||
# Copyright (c) 2012-2014, Max Zwiessele, James Hensman
|
||||
# Licensed under the BSD 3-clause license (see LICENSE.txt)
|
||||
|
||||
__doc__ = """
|
||||
|
|
@ -69,6 +69,7 @@ from expectation_propagation_dtc import EPDTC
|
|||
from dtc import DTC
|
||||
from fitc import FITC
|
||||
from var_dtc_parallel import VarDTC_minibatch
|
||||
from svgp import SVGP
|
||||
|
||||
# class FullLatentFunctionData(object):
|
||||
#
|
||||
|
|
|
|||
88
GPy/inference/latent_function_inference/svgp.py
Normal file
88
GPy/inference/latent_function_inference/svgp.py
Normal file
|
|
@ -0,0 +1,88 @@
|
|||
from . import LatentFunctionInference
|
||||
from ...util import linalg
|
||||
from ...util import choleskies
|
||||
import numpy as np
|
||||
from posterior import Posterior
|
||||
|
||||
class SVGP(LatentFunctionInference):
|
||||
def likelihood_quadrature(self, Y, m, v):
|
||||
Ysign = np.where(Y==1,1,-1).flatten()
|
||||
from scipy import stats
|
||||
self.gh_x, self.gh_w = np.polynomial.hermite.hermgauss(20)
|
||||
|
||||
#assume probit for now.
|
||||
X = self.gh_x[None,:]*np.sqrt(2.*v[:,None]) + (m*Ysign)[:,None]
|
||||
p = stats.norm.cdf(X)
|
||||
p = np.clip(p, 1e-9, 1.-1e-9) # for numerical stability
|
||||
N = stats.norm.pdf(X)
|
||||
F = np.log(p).dot(self.gh_w)
|
||||
NoverP = N/p
|
||||
dF_dm = (NoverP*Ysign[:,None]).dot(self.gh_w)
|
||||
dF_dv = -0.5*(NoverP**2 + NoverP*X).dot(self.gh_w)
|
||||
return F, dF_dm, dF_dv
|
||||
|
||||
|
||||
def inference(self, q_u_mean, q_u_chol, kern, X, Z, likelihood, Y, Y_metadata=None):
|
||||
assert Y.shape[1]==1, "multi outputs not implemented"
|
||||
|
||||
|
||||
num_inducing = Z.shape[0]
|
||||
#expand cholesky representation
|
||||
L = choleskies.flat_to_triang(q_u_chol[:,None]).squeeze()
|
||||
S = L.dot(L.T)
|
||||
Si,_ = linalg.dpotri(np.asfortranarray(L), lower=1)
|
||||
logdetS = 2.*np.sum(np.log(np.abs(np.diag(L))))
|
||||
|
||||
if np.any(np.isinf(Si)):
|
||||
print "warning:Cholesky representation unstable"
|
||||
S = S + np.eye(S.shape[0])*1e-5*np.max(np.max(S))
|
||||
Si, Lnew, _,_ = linalg.pdinv(S)
|
||||
|
||||
#compute kernel related stuff
|
||||
Kmm = kern.K(Z)
|
||||
Knm = kern.K(X, Z)
|
||||
Knn_diag = kern.Kdiag(X)
|
||||
Kmmi, Lm, Lmi, logdetKmm = linalg.pdinv(Kmm)
|
||||
|
||||
#compute the marginal means and variances of q(f)
|
||||
A = np.dot(Knm, Kmmi)
|
||||
mu = np.dot(A, q_u_mean)
|
||||
v = Knn_diag - np.sum(A*Knm,1) + np.sum(A*A.dot(S),1)
|
||||
|
||||
#compute the KL term
|
||||
Kmmim = np.dot(Kmmi, q_u_mean)
|
||||
KL = -0.5*logdetS -0.5*num_inducing + 0.5*logdetKmm + 0.5*np.sum(Kmmi*S) + 0.5*q_u_mean.dot(Kmmim)
|
||||
dKL_dm = Kmmim
|
||||
dKL_dS = 0.5*(Kmmi - Si)
|
||||
dKL_dKmm = 0.5*Kmmi - 0.5*Kmmi.dot(S).dot(Kmmi) - 0.5*Kmmim[:,None]*Kmmim[None,:]
|
||||
|
||||
#quadrature for the likelihood
|
||||
#F, dF_dmu, dF_dv = likelihood.variational_expectations(Y, mu, v)
|
||||
F, dF_dmu, dF_dv = self.likelihood_quadrature(Y, mu, v)
|
||||
|
||||
|
||||
#rescale the F term if working on a batch
|
||||
#F, dF_dmu, dF_dv = F*batch_scale, dF_dmu*batch_scale, dF_dv*batch_scale
|
||||
|
||||
#derivatives of quadratured likelihood
|
||||
Adv = A.T*dF_dv # As if dF_Dv is diagonal
|
||||
Admu = A.T.dot(dF_dmu)
|
||||
AdvA = np.dot(Adv,A)
|
||||
tmp = AdvA.dot(S).dot(Kmmi)
|
||||
dF_dKmm = -Admu[:,None].dot(Kmmim[None,:]) + AdvA - tmp - tmp.T
|
||||
dF_dKmm = 0.5*(dF_dKmm + dF_dKmm.T) # necessary? GPy bug?
|
||||
dF_dKmn = 2.*(Kmmi.dot(S) - np.eye(num_inducing)).dot(Adv) + Kmmim[:,None]*dF_dmu[None,:]
|
||||
dF_dm = Admu
|
||||
dF_dS = AdvA
|
||||
|
||||
#sum (gradients of) expected likelihood and KL part
|
||||
log_marginal = F.sum() - KL
|
||||
dL_dm, dL_dS, dL_dKmm, dL_dKmn = dF_dm - dKL_dm, dF_dS- dKL_dS, dF_dKmm- dKL_dKmm, dF_dKmn
|
||||
|
||||
dL_dchol = 2.*np.dot(dL_dS, L)
|
||||
dL_dchol = choleskies.triang_to_flat(dL_dchol[:,:,None]).squeeze()
|
||||
|
||||
return Posterior(mean=q_u_mean, cov=S, K=Kmm), log_marginal, {'dL_dKmm':dL_dKmm, 'dL_dKmn':dL_dKmn, 'dL_dKdiag': dF_dv, 'dL_dm':dL_dm, 'dL_dchol':dL_dchol}
|
||||
|
||||
|
||||
|
||||
130
GPy/util/choleskies.py
Normal file
130
GPy/util/choleskies.py
Normal file
|
|
@ -0,0 +1,130 @@
|
|||
# Copyright James Hensman and Max Zwiessele 2014
|
||||
# Licensed under the GNU GPL version 3.0
|
||||
|
||||
import numpy as np
|
||||
from scipy import weave
|
||||
import linalg
|
||||
|
||||
|
||||
def safe_root(N):
|
||||
i = np.sqrt(N)
|
||||
j = int(i)
|
||||
if i != j:
|
||||
raise ValueError, "N is not square!"
|
||||
return j
|
||||
|
||||
def flat_to_triang(flat):
|
||||
"""take a matrix N x D and return a M X M x D array where
|
||||
|
||||
N = M(M+1)/2
|
||||
|
||||
the lower triangluar portion of the d'th slice of the result is filled by the d'th column of flat.
|
||||
"""
|
||||
N, D = flat.shape
|
||||
M = (-1 + safe_root(8*N+1))/2
|
||||
ret = np.zeros((M, M, D))
|
||||
flat = np.ascontiguousarray(flat)
|
||||
|
||||
code = """
|
||||
int count = 0;
|
||||
for(int m=0; m<M; m++)
|
||||
{
|
||||
for(int mm=0; mm<=m; mm++)
|
||||
{
|
||||
for(int d=0; d<D; d++)
|
||||
{
|
||||
ret[d + m*D*M + mm*D] = flat[count];
|
||||
count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
weave.inline(code, ['flat', 'ret', 'D', 'M'])
|
||||
return ret
|
||||
|
||||
def triang_to_flat(L):
|
||||
M, _, D = L.shape
|
||||
|
||||
L = np.ascontiguousarray(L) # should do nothing if L was created by flat_to_triang
|
||||
|
||||
N = M*(M+1)/2
|
||||
flat = np.empty((N, D))
|
||||
code = """
|
||||
int count = 0;
|
||||
for(int m=0; m<M; m++)
|
||||
{
|
||||
for(int mm=0; mm<=m; mm++)
|
||||
{
|
||||
for(int d=0; d<D; d++)
|
||||
{
|
||||
flat[count] = L[d + m*D*M + mm*D];
|
||||
count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
"""
|
||||
weave.inline(code, ['flat', 'L', 'D', 'M'])
|
||||
return flat
|
||||
|
||||
def triang_to_cov(L):
|
||||
return np.dstack([np.dot(L[:,:,i], L[:,:,i].T) for i in xrange(L.shape[-1])])
|
||||
|
||||
def multiple_dpotri_old(Ls):
|
||||
M, _, D = Ls.shape
|
||||
Kis = np.rollaxis(Ls, -1).copy()
|
||||
[dpotri(Kis[i,:,:], overwrite_c=1, lower=1) for i in xrange(D)]
|
||||
code = """
|
||||
for(int d=0; d<D; d++)
|
||||
{
|
||||
for(int m=0; m<M; m++)
|
||||
{
|
||||
for(int mm=0; mm<m; mm++)
|
||||
{
|
||||
Kis[d*M*M + mm*M + m ] = Kis[d*M*M + m*M + mm];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
"""
|
||||
weave.inline(code, ['Kis', 'D', 'M'])
|
||||
Kis = np.rollaxis(Kis, 0, 3) #wtf rollaxis?
|
||||
return Kis
|
||||
|
||||
def multiple_dpotri(Ls):
|
||||
return np.dstack([linalg.dpotri(np.asfortranarray(Ls[:,:,i]), lower=1)[0] for i in range(Ls.shape[-1])])
|
||||
|
||||
|
||||
|
||||
|
||||
def indexes_to_fix_for_low_rank(rank, size):
|
||||
"""
|
||||
work out which indexes of the flatteneed array should be fixed if we want the cholesky to represent a low rank matrix
|
||||
"""
|
||||
#first we'll work out what to keep, and the do the set difference.
|
||||
|
||||
#here are the indexes of the first column, which are the triangular numbers
|
||||
n = np.arange(size)
|
||||
triangulars = (n**2 + n) / 2
|
||||
keep = []
|
||||
for i in range(rank):
|
||||
keep.append(triangulars[i:] + i)
|
||||
#add the diagonal
|
||||
keep.append(triangulars[1:]-1)
|
||||
keep.append((size**2 + size)/2 -1)# the very last element
|
||||
keep = np.hstack(keep)
|
||||
|
||||
return np.setdiff1d(np.arange((size**2+size)/2), keep)
|
||||
|
||||
|
||||
|
||||
#class cholchecker(GPy.core.Model):
|
||||
#def __init__(self, L, name='cholchecker'):
|
||||
#super(cholchecker, self).__init__(name)
|
||||
#self.L = GPy.core.Param('L',L)
|
||||
#self.link_parameter(self.L)
|
||||
#def parameters_changed(self):
|
||||
#LL = flat_to_triang(self.L)
|
||||
#Ki = multiple_dpotri(LL)
|
||||
#self.L.gradient = 2*np.einsum('ijk,jlk->ilk', Ki, LL)
|
||||
#self._loglik = np.sum([np.sum(np.log(np.abs(np.diag()))) for i in range(self.L.shape[-1])])
|
||||
#
|
||||
Loading…
Add table
Add a link
Reference in a new issue