From b6ffb57263a4175868449ad72c13e57226446f2e Mon Sep 17 00:00:00 2001
From: Ricardo
Date: Fri, 25 Jan 2013 09:30:31 +0000
Subject: [PATCH 01/44] Fixing GP_EP

---
 GPy/examples/classification.py           |   3 +-
 GPy/examples/ep_fix.py                   |  39 +++
 GPy/inference/EP.py                      | 314 +++++++++++++++++++++++
 GPy/inference/Expectation_Propagation.py |   2 +-
 GPy/inference/likelihoods.py             |   2 +-
 GPy/models/GP_EP.py                      |   2 +-
 GPy/models/GP_EP2.py                     | 280 ++++++++++++++++++++
 GPy/models/__init__.py                   |   1 +
 8 files changed, 638 insertions(+), 5 deletions(-)
 create mode 100644 GPy/examples/ep_fix.py
 create mode 100644 GPy/inference/EP.py
 create mode 100644 GPy/models/GP_EP2.py

diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py
index 989ed08a..fb14139d 100644
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@@ -76,11 +76,10 @@ def toy_linear_1d_classification(model_type='Full', inducing=4, seed=default_see
 
     # create simple GP model
     if model_type=='Full':
-        m = GPy.models.simple_GP_EP(data['X'],likelihood)
+        m = GPy.models.GP_EP(data['X'],likelihood)
     else:
         # create sparse GP EP model
         m = GPy.models.sparse_GP_EP(data['X'],likelihood=likelihood,inducing=inducing,ep_proxy=model_type)
-
     m.constrain_positive('var')
     m.constrain_positive('len')
 
diff --git a/GPy/examples/ep_fix.py b/GPy/examples/ep_fix.py
new file mode 100644
index 00000000..e4999f30
--- /dev/null
+++ b/GPy/examples/ep_fix.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+"""
+Simple Gaussian Processes classification
+"""
+import pylab as pb
+import numpy as np
+import GPy
+pb.ion()
+
+default_seed=10000
+
+model_type='Full'
+inducing=4
+seed=default_seed
+"""Simple 1D classification example.
+:param model_type: type of model to fit ['Full', 'FITC', 'DTC'].
+:param seed: seed value for data generation (default is 4).
+:type seed: int
+:param inducing: number of inducing variables (only used for 'FITC' or 'DTC').
+:type inducing: int
+"""
+data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
+likelihood = GPy.inference.likelihoods.probit(data['Y'][:, 0:1])
+
+m = GPy.models.GP_EP2(data['X'],likelihood)
+
+#m.constrain_positive('var')
+#m.constrain_positive('len')
+#m.tie_param('lengthscale')
+m.approximate_likelihood()
+# Optimize and plot
+#m.optimize()
+#m.em(plot_all=False) # EM algorithm
+m.plot()
+
+print(m)
diff --git a/GPy/inference/EP.py b/GPy/inference/EP.py
new file mode 100644
index 00000000..fa691961
--- /dev/null
+++ b/GPy/inference/EP.py
@@ -0,0 +1,314 @@
+import numpy as np
+import random
+import pylab as pb #TODO erase me
+from scipy import stats, linalg
+from .likelihoods import likelihood
+from ..core import model
+from ..util.linalg import pdinv,mdot,jitchol
+from ..util.plot import gpplot
+from .. import kern
+
+class EP:
+    def __init__(self,covariance,likelihood,Kmn=None,Knn_diag=None,epsilon=1e-3,powerep=[1.,1.]):
+        """
+        Expectation Propagation
+
+        Arguments
+        ---------
+        X : input observations
+        likelihood : Output's likelihood (likelihood class)
+        kernel : a GPy kernel (kern class)
+        inducing : Either an array specifying the locations of the inducing points or a scalar defining their number. Use None for a non-sparse model.
+        powerep : Power-EP parameters (eta,delta) - 2x1 numpy array (floats)
+        epsilon : Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
+        """
+        self.likelihood = likelihood
+        assert covariance.shape[0] == covariance.shape[1]
+        if Kmn is not None:
+            self.Kmm = covariance
+            self.Kmn = Kmn
+            self.M = self.Kmn.shape[0]
+            self.N = self.Kmn.shape[1]
+            assert self.M < self.N, 'The number of inducing inputs must be smaller than the number of observations'
+        else:
+            self.K = covariance
+            self.N = self.K.shape[0]
+        if Knn_diag is not None:
+            self.Knn_diag = Knn_diag
+            assert len(Knn_diag) == self.N, 'Knn_diagonal has size different from N'
+
+        self.epsilon = epsilon
+        self.eta, self.delta = powerep
+        self.jitter = 1e-12
+
+        """
+        Initial values - Likelihood approximation parameters:
+        p(y|f) = t(f|tau_tilde,v_tilde)
+        """
+        self.tau_tilde = np.zeros(self.N)
+        self.v_tilde = np.zeros(self.N)
+
+    def restart_EP(self):
+        """
+        Set the EP approximation to its initial state
+        """
+        self.tau_tilde = np.zeros(self.N)
+        self.v_tilde = np.zeros(self.N)
+        self.mu = np.zeros(self.N)
+
+class Full(EP):
+    def fit_EP(self):
+        """
+        The expectation-propagation algorithm.
+        For nomenclature see Rasmussen & Williams 2006 (pp. 52-60)
+        """
+        #Prior distribution parameters: p(f|X) = N(f|0,K)
+        #self.K = self.kernel.K(self.X,self.X)
+
+        #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
+        self.mu=np.zeros(self.N)
+        self.Sigma=self.K.copy()
+
+        """
+        Initial values - Cavity distribution parameters:
+        q_(f|mu_,sigma2_) = Product{q_i(f|mu_i,sigma2_i)}
+        sigma_ = 1./tau_
+        mu_ = v_/tau_
+        """
+        self.tau_ = np.empty(self.N,dtype=float)
+        self.v_ = np.empty(self.N,dtype=float)
+
+        #Initial values - Marginal moments
+        z = np.empty(self.N,dtype=float)
+        self.Z_hat = np.empty(self.N,dtype=float)
+        phi = np.empty(self.N,dtype=float)
+        mu_hat = np.empty(self.N,dtype=float)
+        sigma2_hat = np.empty(self.N,dtype=float)
+        self.mu_hat = mu_hat #TODO erase me
+        self.sigma2_hat = sigma2_hat #TODO erase me
+
+        #Approximation
+        epsilon_np1 = self.epsilon + 1.
+        epsilon_np2 = self.epsilon + 1.
+        self.iterations = 0
+        self.np1 = [self.tau_tilde.copy()]
+        self.np2 = [self.v_tilde.copy()]
+        while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
+            update_order = np.arange(self.N)
+            #random.shuffle(update_order) #TODO uncomment
+            for i in update_order:
+                #Cavity distribution parameters
+                self.tau_[i] = 1./self.Sigma[i,i] - self.eta*self.tau_tilde[i]
+                self.v_[i] = self.mu[i]/self.Sigma[i,i] - self.eta*self.v_tilde[i]
+                #Marginal moments
+                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood.moments_match(i,self.tau_[i],self.v_[i])
+                self.mu_hat[i] = mu_hat[i] #TODO erase me
+                self.sigma2_hat[i] = sigma2_hat[i] #TODO erase me
+                #if i == 3:
+                #    a = b
+                #Site parameters update
+                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./self.Sigma[i,i])
+                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - self.mu[i]/self.Sigma[i,i])
+                print Delta_tau
+                self.tau_tilde[i] = self.tau_tilde[i] + Delta_tau
+                self.v_tilde[i] = self.v_tilde[i] + Delta_v
+                #Posterior distribution parameters update
+                si=self.Sigma[:,i].reshape(self.N,1)
+                self.Sigma = self.Sigma - Delta_tau/(1.+ Delta_tau*self.Sigma[i,i])*np.dot(si,si.T)
+                self.mu = np.dot(self.Sigma,self.v_tilde)
+                self.iterations += 1
+            #Sigma recomputation with Cholesky decomposition
+            Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*(self.K)
+            B = np.eye(self.N) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K
+            L = jitchol(B)
+            V,info = linalg.flapack.dtrtrs(L,Sroot_tilde_K,lower=1)
+            self.Sigma = self.K - np.dot(V.T,V)
+            self.mu = np.dot(self.Sigma,self.v_tilde)
+            epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N
+            epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
+            self.np1.append(self.tau_tilde.copy())
+            self.np2.append(self.v_tilde.copy())
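The rank-one Sigma update inside the loop above is a Sherman-Morrison step: adding Delta_tau to site i's precision turns (Sigma^-1 + Delta_tau*e_i*e_i^T)^-1 into Sigma - Delta_tau/(1 + Delta_tau*Sigma[i,i]) * s_i*s_i^T with s_i = Sigma[:,i]. A minimal standalone numpy check of that identity (toy matrix and site values, not GPy code):

    import numpy as np

    rng = np.random.RandomState(0)
    A = rng.randn(4, 4)
    Sigma = np.dot(A, A.T) + 4.*np.eye(4)   # toy SPD posterior covariance
    i, Delta_tau = 2, 0.7                   # hypothetical site update

    # rank-one form used in fit_EP above
    si = Sigma[:, i:i+1]
    Sigma_new = Sigma - Delta_tau/(1. + Delta_tau*Sigma[i, i])*np.dot(si, si.T)

    # direct form: bump the i-th diagonal of the precision matrix and invert
    Prec = np.linalg.inv(Sigma)
    Prec[i, i] += Delta_tau
    print(np.allclose(Sigma_new, np.linalg.inv(Prec)))  # True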
+ """ + + """ + Prior approximation parameters: + q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0) + Sigma0 = Qnn = Knm*Kmmi*Kmn + """ + self.Kmmi, self.Kmm_hld = pdinv(self.Kmm) + self.KmnKnm = np.dot(self.Kmn, self.Kmn.T) + self.KmmiKmn = np.dot(self.Kmmi,self.Kmn) + self.Qnn_diag = np.sum(self.Kmn*self.KmmiKmn,-2) + self.LLT0 = self.Kmm.copy() + + """ + Posterior approximation: q(f|y) = N(f| mu, Sigma) + Sigma = Diag + P*R.T*R*P.T + K + mu = w + P*gamma + """ + self.mu = np.zeros(self.N) + self.LLT = self.Kmm.copy() + self.Sigma_diag = self.Qnn_diag.copy() + + """ + Initial values - Cavity distribution parameters: + q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)} + sigma_ = 1./tau_ + mu_ = v_/tau_ + """ + self.tau_ = np.empty(self.N,dtype=float) + self.v_ = np.empty(self.N,dtype=float) + + #Initial values - Marginal moments + z = np.empty(self.N,dtype=float) + self.Z_hat = np.empty(self.N,dtype=float) + phi = np.empty(self.N,dtype=float) + mu_hat = np.empty(self.N,dtype=float) + sigma2_hat = np.empty(self.N,dtype=float) + + #Approximation + epsilon_np1 = 1 + epsilon_np2 = 1 + self.iterations = 0 + self.np1 = [self.tau_tilde.copy()] + self.np2 = [self.v_tilde.copy()] + while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: + update_order = np.arange(self.N) + random.shuffle(update_order) + for i in update_order: + #Cavity distribution parameters + self.tau_[i] = 1./self.Sigma_diag[i] - self.eta*self.tau_tilde[i] + self.v_[i] = self.mu[i]/self.Sigma_diag[i] - self.eta*self.v_tilde[i] + #Marginal moments + self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood.moments_match(i,self.tau_[i],self.v_[i]) + #Site parameters update + Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./self.Sigma_diag[i]) + Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - self.mu[i]/self.Sigma_diag[i]) + self.tau_tilde[i] = self.tau_tilde[i] + Delta_tau + self.v_tilde[i] = self.v_tilde[i] + Delta_v + #Posterior distribution parameters update + self.LLT = self.LLT + np.outer(self.Kmn[:,i],self.Kmn[:,i])*Delta_tau + L = jitchol(self.LLT) + V,info = linalg.flapack.dtrtrs(L,self.Kmn,lower=1) + self.Sigma_diag = np.sum(V*V,-2) + si = np.sum(V.T*V[:,i],-1) + self.mu = self.mu + (Delta_v-Delta_tau*self.mu[i])*si + self.iterations += 1 + #Sigma recomputation with Cholesky decompositon + self.LLT0 = self.LLT0 + np.dot(self.Kmn*self.tau_tilde[None,:],self.Kmn.T) + self.L = jitchol(self.LLT) + V,info = linalg.flapack.dtrtrs(L,self.Kmn,lower=1) + V2,info = linalg.flapack.dtrtrs(L.T,V,lower=0) + self.Sigma_diag = np.sum(V*V,-2) + Knmv_tilde = np.dot(self.Kmn,self.v_tilde) + self.mu = np.dot(V2.T,Knmv_tilde) + epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N + epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N + self.np1.append(self.tau_tilde.copy()) + self.np2.append(self.v_tilde.copy()) + +class FITC(EP): + def fit_EP(self): + """ + The expectation-propagation algorithm with sparse pseudo-input. + For nomenclature see Naish-Guzman and Holden, 2008. 
+ """ + + """ + Prior approximation parameters: + q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0) + Sigma0 = diag(Knn-Qnn) + Qnn, Qnn = Knm*Kmmi*Kmn + """ + self.Kmmi, self.Kmm_hld = pdinv(self.Kmm) + self.P0 = self.Kmn.T + self.KmnKnm = np.dot(self.P0.T, self.P0) + self.KmmiKmn = np.dot(self.Kmmi,self.P0.T) + self.Qnn_diag = np.sum(self.P0.T*self.KmmiKmn,-2) + self.Diag0 = self.Knn_diag - self.Qnn_diag + self.R0 = jitchol(self.Kmmi).T + + """ + Posterior approximation: q(f|y) = N(f| mu, Sigma) + Sigma = Diag + P*R.T*R*P.T + K + mu = w + P*gamma + """ + self.w = np.zeros(self.N) + self.gamma = np.zeros(self.M) + self.mu = np.zeros(self.N) + self.P = self.P0.copy() + self.R = self.R0.copy() + self.Diag = self.Diag0.copy() + self.Sigma_diag = self.Knn_diag + + """ + Initial values - Cavity distribution parameters: + q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)} + sigma_ = 1./tau_ + mu_ = v_/tau_ + """ + self.tau_ = np.empty(self.N,dtype=float) + self.v_ = np.empty(self.N,dtype=float) + + #Initial values - Marginal moments + z = np.empty(self.N,dtype=float) + self.Z_hat = np.empty(self.N,dtype=float) + phi = np.empty(self.N,dtype=float) + mu_hat = np.empty(self.N,dtype=float) + sigma2_hat = np.empty(self.N,dtype=float) + + #Approximation + epsilon_np1 = 1 + epsilon_np2 = 1 + self.iterations = 0 + self.np1 = [self.tau_tilde.copy()] + self.np2 = [self.v_tilde.copy()] + while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: + update_order = np.arange(self.N) + random.shuffle(update_order) + for i in update_order: + #Cavity distribution parameters + self.tau_[i] = 1./self.Sigma_diag[i] - self.eta*self.tau_tilde[i] + self.v_[i] = self.mu[i]/self.Sigma_diag[i] - self.eta*self.v_tilde[i] + #Marginal moments + self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood.moments_match(i,self.tau_[i],self.v_[i]) + #Site parameters update + Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./self.Sigma_diag[i]) + Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - self.mu[i]/self.Sigma_diag[i]) + self.tau_tilde[i] = self.tau_tilde[i] + Delta_tau + self.v_tilde[i] = self.v_tilde[i] + Delta_v + #Posterior distribution parameters update + dtd1 = Delta_tau*self.Diag[i] + 1. 
+                dii = self.Diag[i]
+                self.Diag[i] = dii - (Delta_tau * dii**2.)/dtd1
+                pi_ = self.P[i,:].reshape(1,self.M)
+                self.P[i,:] = pi_ - (Delta_tau*dii)/dtd1 * pi_
+                Rp_i = np.dot(self.R,pi_.T)
+                RTR = np.dot(self.R.T,np.dot(np.eye(self.M) - Delta_tau/(1.+Delta_tau*self.Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),self.R))
+                self.R = jitchol(RTR).T
+                self.w[i] = self.w[i] + (Delta_v - Delta_tau*self.w[i])*dii/dtd1
+                self.gamma = self.gamma + (Delta_v - Delta_tau*self.mu[i])*np.dot(RTR,self.P[i,:].T)
+                self.RPT = np.dot(self.R,self.P.T)
+                self.Sigma_diag = self.Diag + np.sum(self.RPT.T*self.RPT.T,-1)
+                self.mu = self.w + np.dot(self.P,self.gamma)
+                self.iterations += 1
+            #Sigma recomputation with Cholesky decomposition
+            self.Diag = self.Diag0/(1.+ self.Diag0 * self.tau_tilde)
+            self.P = (self.Diag / self.Diag0)[:,None] * self.P0
+            self.RPT0 = np.dot(self.R0,self.P0.T)
+            L = jitchol(np.eye(self.M) + np.dot(self.RPT0,(1./self.Diag0 - self.Diag/(self.Diag0**2))[:,None]*self.RPT0.T))
+            self.R,info = linalg.flapack.dtrtrs(L,self.R0,lower=1)
+            self.RPT = np.dot(self.R,self.P.T)
+            self.Sigma_diag = self.Diag + np.sum(self.RPT.T*self.RPT.T,-1)
+            self.w = self.Diag * self.v_tilde
+            self.gamma = np.dot(self.R.T, np.dot(self.RPT,self.v_tilde))
+            self.mu = self.w + np.dot(self.P,self.gamma)
+            epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N
+            epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
+            self.np1.append(self.tau_tilde.copy())
+            self.np2.append(self.v_tilde.copy())
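Every variant above touches the likelihood only through moments_match(i, tau_, v_), which must return the zeroth, first and second moments (Z_hat, mu_hat, sigma2_hat) of the tilted distribution at site i. For the probit likelihood used in the examples these moments have the closed form of Rasmussen & Williams (2006, eq. 3.58); a self-contained sketch of that computation (the function name and toy values are illustrative, not the GPy API):

    import numpy as np
    from scipy import stats

    def probit_moments(y, tau_c, v_c):
        """Tilted moments for one site: cavity N(v_c/tau_c, 1/tau_c), label y in {-1,+1}."""
        mu_c, s2_c = v_c/tau_c, 1./tau_c
        z = y*mu_c/np.sqrt(1. + s2_c)
        Z_hat = stats.norm.cdf(z)
        ratio = stats.norm.pdf(z)/Z_hat
        mu_hat = mu_c + y*s2_c*ratio/np.sqrt(1. + s2_c)
        sigma2_hat = s2_c - s2_c**2*ratio/(1. + s2_c)*(z + ratio)
        return Z_hat, mu_hat, sigma2_hat

    print(probit_moments(1, tau_c=2.0, v_c=0.5))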
diff --git a/GPy/inference/Expectation_Propagation.py b/GPy/inference/Expectation_Propagation.py
index 05453f1d..520fc607 100644
--- a/GPy/inference/Expectation_Propagation.py
+++ b/GPy/inference/Expectation_Propagation.py
@@ -116,7 +116,7 @@ class Full(EP_base):
             self.np1.append(self.tau_tilde.copy())
             self.np2.append(self.v_tilde.copy())
             if messages:
-                print "EP iteration %i, epsiolon %d"%(self.iterations,epsilon_np1)
+                print "EP iteration %i, epsilon %d"%(self.iterations,epsilon_np1)
 
 class FITC(EP_base):
     """
diff --git a/GPy/inference/likelihoods.py b/GPy/inference/likelihoods.py
index 5f0eb7ff..ff4770f6 100644
--- a/GPy/inference/likelihoods.py
+++ b/GPy/inference/likelihoods.py
@@ -32,7 +32,7 @@ class likelihood:
         """
         assert X_new.shape[1] == 1, 'Number of dimensions must be 1'
         gpplot(X_new,Mean_new,Var_new)
-        pb.errorbar(X_u,Mean_u,2*np.sqrt(Var_u),fmt='r+')
+        pb.errorbar(X_u.flatten(),Mean_u.flatten(),2*np.sqrt(Var_u.flatten()),fmt='r+')
         pb.plot(X_u,Mean_u,'ro')
 
     def plot2D(self,X,X_new,F_new,U=None):
diff --git a/GPy/models/GP_EP.py b/GPy/models/GP_EP.py
index 51d69d0a..302ff366 100644
--- a/GPy/models/GP_EP.py
+++ b/GPy/models/GP_EP.py
@@ -57,7 +57,7 @@ class GP_EP(model):
     def posterior_param(self):
         self.K = self.kernel.K(self.X)
         self.Sroot_tilde_K = np.sqrt(self.ep_approx.tau_tilde)[:,None]*self.K
-        B = np.eye(self.N) + np.sqrt(self.ep_approx.tau_tilde)[None,:]*self.Sroot_tilde_K
+        B = np.eye(self.N) + np.sqrt(self.ep_approx.tau_tilde)*self.Sroot_tilde_K
         #self.L = np.linalg.cholesky(B)
         self.L = jitchol(B)
         V,info = linalg.flapack.dtrtrs(self.L,self.Sroot_tilde_K,lower=1)
diff --git a/GPy/models/GP_EP2.py b/GPy/models/GP_EP2.py
new file mode 100644
index 00000000..c68e7b70
--- /dev/null
+++ b/GPy/models/GP_EP2.py
@@ -0,0 +1,280 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import numpy as np
+import pylab as pb
+from scipy import stats, linalg
+from .. import kern
+from ..inference.EP import Full
+from ..inference.likelihoods import likelihood,probit,poisson,gaussian
+from ..core import model
+from ..util.linalg import pdinv,mdot #,jitchol
+from ..util.plot import gpplot, Tango
+
+class GP_EP2(model):
+    def __init__(self,X,likelihood,kernel=None,normalize_X=False,Xslices=None,epsilon_ep=1e-3,epsion_em=.1,powerep=[1.,1.]):
+        """
+        Simple Gaussian Process with Non-Gaussian likelihood
+
+        Arguments
+        ---------
+        :param X: input observations (NxD numpy.darray)
+        :param likelihood: a GPy likelihood (likelihood class)
+        :param kernel: a GPy kernel, defaults to rbf+white
+        :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
+        :type normalize_X: False|True
+        :param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 1e-3
+        :param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.] (list)
+        :param Xslices: how the X,Y data co-vary in the kernel (i.e. which "outputs" they correspond to). See (link:slicing)
+        :rtype: model object.
+        """
+        #.. Note:: Multiple independent outputs are allowed using columns of Y #TODO add this note?
+        if kernel is None:
+            kernel = kern.rbf(X.shape[1]) + kern.bias(X.shape[1]) + kern.white(X.shape[1])
+
+        # parse arguments
+        self.Xslices = Xslices
+        assert isinstance(kernel, kern.kern)
+        self.likelihood = likelihood
+        #self.Y = self.likelihood.Y #we might not need this
+        self.kern = kernel
+        self.X = X
+        assert len(self.X.shape)==2
+        #assert len(self.Y.shape)==2
+        #assert self.X.shape[0] == self.Y.shape[0]
+        #self.N, self.D = self.Y.shape
+        self.D = 1
+        self.N, self.Q = self.X.shape
+
+        #here's some simple normalisation
+        if normalize_X:
+            self._Xmean = X.mean(0)[None,:]
+            self._Xstd = X.std(0)[None,:]
+            self.X = (X.copy() - self._Xmean) / self._Xstd
+            if hasattr(self,'Z'):
+                self.Z = (self.Z - self._Xmean) / self._Xstd
+        else:
+            self._Xmean = np.zeros((1,self.X.shape[1]))
+            self._Xstd = np.ones((1,self.X.shape[1]))
+
+        #THIS PART IS NOT NEEDED
+        """
+        if normalize_Y:
+            self._Ymean = Y.mean(0)[None,:]
+            self._Ystd = Y.std(0)[None,:]
+            self.Y = (Y.copy()- self._Ymean) / self._Ystd
+        else:
+            self._Ymean = np.zeros((1,self.Y.shape[1]))
+            self._Ystd = np.ones((1,self.Y.shape[1]))
+
+        if self.D > self.N:
+            # then it's more efficient to store YYT
+            self.YYT = np.dot(self.Y, self.Y.T)
+        else:
+            self.YYT = None
+        """
+        self.eta,self.delta = powerep
+        self.epsilon_ep = epsilon_ep
+        self.tau_tilde = np.zeros([self.N,self.D])
+        self.v_tilde = np.zeros([self.N,self.D])
+        model.__init__(self)
+
+    def _set_params(self,p):
+        self.kern._set_params_transformed(p)
+        self.K = self.kern.K(self.X,slices1=self.Xslices)
+        self.posterior_params()
+
+    def _get_params(self):
+        return self.kern._get_params_transformed()
+
+    def _get_param_names(self):
+        return self.kern._get_param_names_transformed()
+
+    def approximate_likelihood(self):
+        self.ep_approx = Full(self.K,self.likelihood,epsilon=self.epsilon_ep,powerep=[self.eta,self.delta])
+        self.ep_approx.fit_EP()
+        self.tau_tilde = self.ep_approx.tau_tilde[:,None]
+        self.v_tilde = self.ep_approx.tau_tilde[:,None]
+        self.posterior_params()
+        self.Y = self.v_tilde/self.tau_tilde
+        self._Ymean = np.zeros((1,self.Y.shape[1]))
+        self._Ystd = np.ones((1,self.Y.shape[1]))
+        #self.YYT = np.dot(self.Y, self.Y.T)
+
+    def posterior_params(self):
+        self.Sroot_tilde_K = np.sqrt(self.tau_tilde.flatten())[:,None]*self.K
+        B = np.eye(self.N) + np.sqrt(self.tau_tilde.flatten())[None,:]*self.Sroot_tilde_K
+        self.Bi,self.L,self.Li,B_logdet = pdinv(B)
+        V = np.dot(self.Li,self.Sroot_tilde_K)
+        #V,info = linalg.flapack.dtrtrs(self.L,self.Sroot_tilde_K,lower=1)
+        self.Sigma = self.K - np.dot(V.T,V)
+        self.mu = np.dot(self.Sigma,self.v_tilde.flatten())
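posterior_params computes the EP posterior without ever inverting K: with S = diag(tau_tilde), the Woodbury identity gives Sigma = (K^-1 + S)^-1 = K - K*S^(1/2)*B^-1*S^(1/2)*K, where B = I + S^(1/2)*K*S^(1/2) is the well-conditioned matrix that actually gets factorised. A standalone numeric check of that identity (toy values, not GPy code):

    import numpy as np

    rng = np.random.RandomState(2)
    A = rng.randn(5, 5)
    K = np.dot(A, A.T) + 5.*np.eye(5)       # toy prior covariance
    tau_tilde = rng.rand(5) + 0.1           # toy positive site precisions

    sq = np.sqrt(tau_tilde)
    B = np.eye(5) + sq[:, None]*K*sq[None, :]
    V = np.linalg.solve(np.linalg.cholesky(B), sq[:, None]*K)
    Sigma = K - np.dot(V.T, V)              # route taken above

    Sigma_direct = np.linalg.inv(np.linalg.inv(K) + np.diag(tau_tilde))
    print(np.allclose(Sigma, Sigma_direct))  # True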
+
+
+    #def _model_fit_term(self):
+    #    """
+    #    Computes the model fit using YYT if it's available
+    #    """
+    #    if self.YYT is None:
+    #        return -0.5*np.sum(np.square(np.dot(self.Li,self.Y)))
+    #    else:
+    #        return -0.5*np.sum(np.multiply(self.Ki, self.YYT))
+
+    def log_likelihood(self):
+        mu_ = self.ep_approx.v_/self.ep_approx.tau_
+        L1 =.5*sum(np.log(1+self.ep_approx.tau_tilde*1./self.ep_approx.tau_))-sum(np.log(np.diag(self.L)))
+        L2A =.5*np.sum((self.Sigma-np.diag(1./(self.ep_approx.tau_+self.ep_approx.tau_tilde))) * np.dot(self.ep_approx.v_tilde[:,None],self.ep_approx.v_tilde[None,:]))
+        L2B = .5*np.dot(mu_*(self.ep_approx.tau_/(self.ep_approx.tau_tilde+self.ep_approx.tau_)),self.ep_approx.tau_tilde*mu_ - 2*self.ep_approx.v_tilde)
+        L3 = sum(np.log(self.ep_approx.Z_hat))
+        return L1 + L2A + L2B + L3
+
+    def dL_dK(self): #FIXME
+        if self.YYT is None:
+            alpha = np.dot(self.Ki,self.Y)
+            dL_dK = 0.5*(np.dot(alpha,alpha.T)-self.D*self.Ki)
+        else:
+            dL_dK = 0.5*(mdot(self.Ki, self.YYT, self.Ki) - self.D*self.Ki)
+
+        return dL_dK
+
+    def _log_likelihood_gradients(self): #FIXME
+        return self.kern.dK_dtheta(partial=self.dL_dK(),X=self.X)
+
+    def predict(self,Xnew, slices=None, full_cov=False):
+        """
+
+        Predict the function(s) at the new point(s) Xnew.
+
+        Arguments
+        ---------
+        :param Xnew: The points at which to make a prediction
+        :type Xnew: np.ndarray, Nnew x self.Q
+        :param slices: specifies which outputs kernel(s) the Xnew correspond to (see below)
+        :type slices: (None, list of slice objects, list of ints)
+        :param full_cov: whether to return the full covariance matrix, or just the diagonal
+        :type full_cov: bool
+        :rtype: posterior mean, a Numpy array, Nnew x self.D
+        :rtype: posterior variance, a Numpy array, Nnew x Nnew x (self.D)
+
+        .. Note:: "slices" specifies how the points X_new co-vary with the training points.
+
+            - If None, the new points covary through every kernel part (default)
+            - If a list of slices, the i^th slice specifies which data are affected by the i^th kernel part
+            - If a list of booleans, specifying which kernel parts are active
+
+        If full_cov and self.D > 1, the return shape of var is Nnew x Nnew x self.D. If self.D == 1, the return shape is Nnew x Nnew.
+        This is to allow for different normalisations of the output dimensions.
+        """
+
+        #normalise X values
+        Xnew = (Xnew.copy() - self._Xmean) / self._Xstd
+        mu, var, phi = self._raw_predict(Xnew, slices, full_cov)
+
+        #un-normalise
+        mu = mu*self._Ystd + self._Ymean
+        if full_cov:
+            if self.D==1:
+                var *= np.square(self._Ystd)
+            else:
+                var = var[:,:,None] * np.square(self._Ystd)
+        else:
+            if self.D==1:
+                var *= np.square(np.squeeze(self._Ystd))
+            else:
+                var = var[:,None] * np.square(self._Ystd)
+
+        return mu,var,phi
+
+    def _raw_predict(self,_Xnew,slices, full_cov=False):
+        """Internal helper function for making predictions, does not account for normalisation"""
+        """
+        Kx = self.kern.K(self.X,_Xnew, slices1=self.Xslices,slices2=slices)
+        mu = np.dot(np.dot(Kx.T,self.Ki),self.Y)
+        KiKx = np.dot(self.Ki,Kx)
+        if full_cov:
+            Kxx = self.kern.K(_Xnew, slices1=slices,slices2=slices)
+            var = Kxx - np.dot(KiKx.T,Kx)
+        else:
+            Kxx = self.kern.Kdiag(_Xnew, slices=slices)
+            var = Kxx - np.sum(np.multiply(KiKx,Kx),0)
+        return mu, var
+        """
+        K_x = self.kern.K(self.X,_Xnew)
+        Kxx = self.kern.K(_Xnew)
+        #aux1,info = linalg.flapack.dtrtrs(self.L,np.dot(self.Sroot_tilde_K,self.ep_approx.v_tilde),lower=1)
+        #aux2,info = linalg.flapack.dtrtrs(self.L.T, aux1,lower=0)
+        #aux2 = mdot(self.Li.T,self.Li,self.Sroot_tilde_K,self.ep_approx.v_tilde)
+        aux2 = mdot(self.Bi,self.Sroot_tilde_K,self.ep_approx.v_tilde)
+        zeta = np.sqrt(self.ep_approx.tau_tilde)*aux2
+        f = np.dot(K_x.T,self.ep_approx.v_tilde-zeta)
+        #v,info = linalg.flapack.dtrtrs(self.L,np.sqrt(self.ep_approx.tau_tilde)[:,None]*K_x,lower=1)
+        v = mdot(self.Li,np.sqrt(self.ep_approx.tau_tilde)[:,None]*K_x)
+        variance = Kxx - np.dot(v.T,v)
+        vdiag = np.diag(variance)
+        y=self.likelihood.predictive_mean(f,vdiag)
+        return f,vdiag,y
+
+    def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None):
+        """
+        :param samples: the number of a posteriori samples to plot
+        :param which_data: which of the training data to plot (default all)
+        :type which_data: 'all' or a slice object to slice self.X, self.Y
+        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
+        :param which_functions: which of the kernel functions to plot (additively)
+        :type which_functions: list of bools
+        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
+
+        Plot the posterior of the GP.
+          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
+          - In two dimensions, a contour-plot shows the mean predicted function
+          - In higher dimensions, we've not implemented this yet !TODO!
+
+        Can plot only part of the data and part of the posterior functions using which_data and which_functions
+        """
+        if which_functions=='all':
+            which_functions = [True]*self.kern.Nparts
+        if which_data=='all':
+            which_data = slice(None)
+
+        X = self.X[which_data,:]
+        Y = self.Y[which_data,:]
+
+        Xorig = X*self._Xstd + self._Xmean
+        Yorig = Y*self._Ystd + self._Ymean
+        if plot_limits is None:
+            xmin,xmax = Xorig.min(0),Xorig.max(0)
+            xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
+        elif len(plot_limits)==2:
+            xmin, xmax = plot_limits
+        else:
+            raise ValueError, "Bad limits for plotting"
+
+        if self.X.shape[1]==1:
+            Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None]
+            #m,v,phi = self.predict(Xnew,slices=which_functions)
+            #gpplot(Xnew,m,v)
+            mu_f, var_f, phi_f = self.predict(Xnew,slices=which_functions)
+            pb.subplot(211)
+            self.likelihood.plot1Da(X_new=Xnew,Mean_new=mu_f,Var_new=var_f,X_u=self.X,Mean_u=self.mu,Var_u=np.diag(self.Sigma))
+            if samples:
+                s = np.random.multivariate_normal(m.flatten(),v,samples)
+                pb.plot(Xnew.flatten(),s.T, alpha = 0.4, c='#3465a4', linewidth = 0.8)
+            pb.xlim(xmin,xmax)
+            pb.subplot(212)
+            self.likelihood.plot1Db(self.X,Xnew,phi_f)
+
+        elif self.X.shape[1]==2:
+            resolution = 50 or resolution
+            xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution]
+            Xtest = np.vstack((xx.flatten(),yy.flatten())).T
+            zz,vv = self.predict(Xtest,slices=which_functions)
+            zz = zz.reshape(resolution,resolution)
+            pb.contour(xx,yy,zz,vmin=zz.min(),vmax=zz.max(),cmap=pb.cm.jet)
+            pb.scatter(Xorig[:,0],Xorig[:,1],40,Yorig,linewidth=0,cmap=pb.cm.jet,vmin=zz.min(),vmax=zz.max())
+            pb.xlim(xmin[0],xmax[0])
+            pb.ylim(xmin[1],xmax[1])
+
+        else:
+            raise NotImplementedError, "Cannot plot GPs with more than two input dimensions"
diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py
index ab7ff5b4..5f824f2b 100644
--- a/GPy/models/__init__.py
+++ b/GPy/models/__init__.py
@@ -7,6 +7,7 @@ from sparse_GP_regression import sparse_GP_regression
 from GPLVM import GPLVM
 from warped_GP import warpedGP
 from GP_EP import GP_EP
+from GP_EP2 import GP_EP2
 from generalized_FITC import generalized_FITC
 from sparse_GPLVM import sparse_GPLVM
 from uncollapsed_sparse_GP import uncollapsed_sparse_GP

From 6a2e0a1fe554dfe00036b6fdef82c9d437bff3f0 Mon Sep 17 00:00:00 2001
From: Ricardo Andrade
Date: Fri, 25 Jan 2013 18:14:28 +0000
Subject: [PATCH 02/44] fixing EP and merging it with GP_regression

---
 GPy/examples/ep_fix.py       |  11 +-
 GPy/inference/EP.py          |  12 +-
 GPy/inference/likelihoods.py |  31 ++--
 GPy/models/GP.py             | 312 +++++++++++++++++++++++++++++++++++
 GPy/models/GP_EP.py          |   2 +-
 GPy/models/GP_EP2.py         | 127 +++++++-------
 GPy/models/__init__.py       |   1 +
 7 files changed, 403 insertions(+), 93 deletions(-)
 create mode 100644 GPy/models/GP.py

diff --git a/GPy/examples/ep_fix.py b/GPy/examples/ep_fix.py
index e4999f30..2da94335 100644
--- a/GPy/examples/ep_fix.py
+++ b/GPy/examples/ep_fix.py
@@ -25,14 +25,15 @@ seed=default_seed
 data = GPy.util.datasets.toy_linear_1d_classification(seed=seed)
 likelihood = GPy.inference.likelihoods.probit(data['Y'][:, 0:1])
 
-m = GPy.models.GP_EP2(data['X'],likelihood)
+m = GPy.models.GP(data['X'],likelihood=likelihood)
 
-#m.constrain_positive('var')
-#m.constrain_positive('len')
-#m.tie_param('lengthscale')
+m.constrain_positive('var')
+m.constrain_positive('len')
+m.tie_param('lengthscale')
 m.approximate_likelihood()
+print m.checkgrad()
 # Optimize and plot
-#m.optimize()
+m.optimize()
 #m.em(plot_all=False) # EM algorithm
 m.plot()
 
 print(m)
diff --git a/GPy/inference/EP.py b/GPy/inference/EP.py
index fa691961..f7c163b1 100644
--- a/GPy/inference/EP.py
+++ b/GPy/inference/EP.py
@@ -60,7 +60,7 @@ class Full(EP):
     def fit_EP(self):
         """
         The expectation-propagation algorithm.
-        For nomenclature see Rasmussen & Williams 2006 (pp. 52-60)
+        For nomenclature see Rasmussen & Williams 2006.
         """
         #Prior distribution parameters: p(f|X) = N(f|0,K)
         #self.K = self.kernel.K(self.X,self.X)
@@ -84,8 +84,6 @@ class Full(EP):
         phi = np.empty(self.N,dtype=float)
         mu_hat = np.empty(self.N,dtype=float)
         sigma2_hat = np.empty(self.N,dtype=float)
-        self.mu_hat = mu_hat #TODO erase me
-        self.sigma2_hat = sigma2_hat #TODO erase me
 
         #Approximation
         epsilon_np1 = self.epsilon + 1.
@@ -95,21 +93,16 @@ class Full(EP):
         self.np2 = [self.v_tilde.copy()]
         while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
             update_order = np.arange(self.N)
-            #random.shuffle(update_order) #TODO uncomment
+            random.shuffle(update_order)
             for i in update_order:
                 #Cavity distribution parameters
                 self.tau_[i] = 1./self.Sigma[i,i] - self.eta*self.tau_tilde[i]
                 self.v_[i] = self.mu[i]/self.Sigma[i,i] - self.eta*self.v_tilde[i]
                 #Marginal moments
                 self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood.moments_match(i,self.tau_[i],self.v_[i])
-                self.mu_hat[i] = mu_hat[i] #TODO erase me
-                self.sigma2_hat[i] = sigma2_hat[i] #TODO erase me
-                #if i == 3:
-                #    a = b
                 #Site parameters update
                 Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./self.Sigma[i,i])
                 Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - self.mu[i]/self.Sigma[i,i])
-                print Delta_tau
                 self.tau_tilde[i] = self.tau_tilde[i] + Delta_tau
                 self.v_tilde[i] = self.v_tilde[i] + Delta_v
                 #Posterior distribution parameters update
@@ -128,6 +121,7 @@ class Full(EP):
             epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
             self.np1.append(self.tau_tilde.copy())
             self.np2.append(self.v_tilde.copy())
+        return self.tau_tilde[:,None], self.v_tilde[:,None], self.Z_hat[:,None], self.tau_[:,None], self.v_[:,None]
 
 class DTC(EP):
     def fit_EP(self):
diff --git a/GPy/inference/likelihoods.py b/GPy/inference/likelihoods.py
index ff4770f6..29e194e0 100644
--- a/GPy/inference/likelihoods.py
+++ b/GPy/inference/likelihoods.py
@@ -19,7 +19,7 @@ class likelihood:
         self.Y = Y
         self.N = self.Y.shape[0]
 
-    def plot1Da(self,X_new,Mean_new,Var_new,X_u,Mean_u,Var_u):
+    def plot1Da(self,X,mean,var,Z=None,mean_Z=None,var_Z=None):
         """
         Plot the predictive distribution of the GP model for 1-dimensional inputs
@@ -30,10 +30,18 @@
         :param Mean_u: mean values at X_u
         :param Var_new: variance values at X_u
         """
-        assert X_new.shape[1] == 1, 'Number of dimensions must be 1'
-        gpplot(X_new,Mean_new,Var_new)
-        pb.errorbar(X_u.flatten(),Mean_u.flatten(),2*np.sqrt(Var_u.flatten()),fmt='r+')
-        pb.plot(X_u,Mean_u,'ro')
+        assert X.shape[1] == 1, 'Number of dimensions must be 1'
+        gpplot(X,mean,var.flatten())
+        pb.errorbar(Z.flatten(),mean_Z.flatten(),2*np.sqrt(var_Z.flatten()),fmt='r+')
+        pb.plot(Z,mean_Z,'ro')
+
+    def plot1Db(self,X_obs,X,phi,Z=None):
+        assert X_obs.shape[1] == 1, 'Number of dimensions must be 1'
+        gpplot(X,phi,np.zeros(X.shape[0]))
+        pb.plot(X_obs,(self.Y+1)/2,'kx',mew=1.5)
+        pb.ylim(-0.2,1.2)
+        if Z is not None:
+            pb.plot(Z,Z*0+.5,'r|',mew=1.5,markersize=12)
 
     def plot2D(self,X,X_new,F_new,U=None):
         """
@@ -88,16 +96,11 @@ class probit(likelihood):
         sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat)
         return Z_hat, mu_hat, sigma2_hat
 
-    def plot1Db(self,X,X_new,F_new,U=None):
-        assert X.shape[1] == 1, 'Number of dimensions must be 1'
-        gpplot(X_new,F_new,np.zeros(X_new.shape[0]))
-        pb.plot(X,(self.Y+1)/2,'kx',mew=1.5)
-        pb.ylim(-0.2,1.2)
-        if U is not None:
-            pb.plot(U,U*0+.5,'r|',mew=1.5,markersize=12)
-
-    def predictive_mean(self,mu,variance):
-        return stats.norm.cdf(mu/np.sqrt(1+variance))
+    def predictive_mean(self,mu,var):
+        mu = mu.flatten()
+        var = var.flatten()
+        return stats.norm.cdf(mu/np.sqrt(1+var))
 
     def _log_likelihood_gradients():
         raise NotImplementedError
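The reworked predictive_mean uses the standard result that a probit averaged over a Gaussian stays in closed form: if f ~ N(mu, var), then E[Phi(f)] = Phi(mu/sqrt(1 + var)). A quick standalone Monte Carlo sanity check of that identity (toy numbers):

    import numpy as np
    from scipy import stats

    mu, var = 0.3, 1.7                                   # toy latent mean and variance
    closed_form = stats.norm.cdf(mu/np.sqrt(1. + var))

    f = np.random.RandomState(3).randn(200000)*np.sqrt(var) + mu
    print(closed_form, stats.norm.cdf(f).mean())         # agree to ~3 decimals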
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
new file mode 100644
index 00000000..4a8d23e9
--- /dev/null
+++ b/GPy/models/GP.py
@@ -0,0 +1,312 @@
+# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+
+import numpy as np
+import pylab as pb
+from .. import kern
+from ..core import model
+from ..util.linalg import pdinv,mdot
+from ..util.plot import gpplot, Tango
+from ..inference.EP import Full
+from ..inference.likelihoods import likelihood,probit,poisson,gaussian
+
+class GP(model):
+    """
+    Gaussian Process model for regression
+
+    :param X: input observations
+    :param Y: observed values
+    :param kernel: a GPy kernel, defaults to rbf+white
+    :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales)
+    :type normalize_X: False|True
+    :param normalize_Y: whether to normalize the output data before computing (predictions will be in original scales)
+    :type normalize_Y: False|True
+    :param Xslices: how the X,Y data co-vary in the kernel (i.e. which "outputs" they correspond to). See (link:slicing)
+    :rtype: model object
+
+    .. Note:: Multiple independent outputs are allowed using columns of Y
+
+    """
+
+    def __init__(self,X,Y=None,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None,likelihood=None,epsilon_ep=1e-3,epsion_em=.1,powerep=[1.,1.]):
+        #TODO: specify beta parameter explicitly
+
+        # parse arguments
+        self.Xslices = Xslices
+        self.X = X
+        self.N, self.Q = self.X.shape
+        assert len(self.X.shape)==2
+        if kernel is None:
+            kernel = kern.rbf(X.shape[1]) + kern.bias(X.shape[1]) + kern.white(X.shape[1])
+        else:
+            assert isinstance(kernel, kern.kern)
+        self.kern = kernel
+
+        #here's some simple normalisation
+        if normalize_X:
+            self._Xmean = X.mean(0)[None,:]
+            self._Xstd = X.std(0)[None,:]
+            self.X = (X.copy() - self._Xmean) / self._Xstd
+            if hasattr(self,'Z'):
+                self.Z = (self.Z - self._Xmean) / self._Xstd
+        else:
+            self._Xmean = np.zeros((1,self.X.shape[1]))
+            self._Xstd = np.ones((1,self.X.shape[1]))
+
+        # Y - likelihood related variables, these might change depending on whether EP is used
+        if likelihood is None:
+            assert Y is not None, "Either Y or likelihood must be defined"
+            self.likelihood = gaussian(Y)
+        else:
+            self.likelihood = likelihood
+        assert len(self.likelihood.Y.shape)==2
+        assert self.X.shape[0] == self.likelihood.Y.shape[0]
+        self.N, self.D = self.likelihood.Y.shape
+
+        if isinstance(self.likelihood,gaussian):
+            self.EP = False
+            self.Y = Y
+
+            #here's some simple normalisation
+            if normalize_Y:
+                self._Ymean = Y.mean(0)[None,:]
+                self._Ystd = Y.std(0)[None,:]
+                self.Y = (Y.copy()- self._Ymean) / self._Ystd
+            else:
+                self._Ymean = np.zeros((1,self.Y.shape[1]))
+                self._Ystd = np.ones((1,self.Y.shape[1]))
+
+            if self.D > self.N:
+                # then it's more efficient to store YYT
+                self.YYT = np.dot(self.Y, self.Y.T)
+            else:
+                self.YYT = None
+
+        else:
+            # Y is defined after approximating the likelihood
+            self.EP = True
+            self.eta,self.delta = powerep
+            self.epsilon_ep = epsilon_ep
+            self.tau_tilde = np.ones([self.N,self.D])
+            self.v_tilde = np.zeros([self.N,self.D])
+            self.tau_ = np.ones([self.N,self.D])
+            self.v_ = np.zeros([self.N,self.D])
+            self.Z_hat = np.ones([self.N,self.D])
+
+        model.__init__(self)
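The constructor above therefore supports two modes: pass Y for plain regression (a gaussian likelihood is built internally and self.EP stays False), or pass a non-Gaussian likelihood and run EP before optimizing, as the updated ep_fix.py does. A hedged usage sketch (the dataset helper and toy regression targets are assumptions based on this patch's examples, not part of the patch itself):

    import numpy as np
    import GPy

    data = GPy.util.datasets.toy_linear_1d_classification(seed=10000)

    # regression mode: Y given, gaussian likelihood created internally
    m_reg = GPy.models.GP(data['X'], Y=np.random.randn(data['X'].shape[0], 1))

    # EP mode: non-Gaussian likelihood, approximate before optimizing
    lik = GPy.inference.likelihoods.probit(data['Y'][:, 0:1])
    m_ep = GPy.models.GP(data['X'], likelihood=lik)
    m_ep.approximate_likelihood()
    m_ep.optimize()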
+    def _set_params(self,p):
+        # TODO: remove beta when using EP
+        self.kern._set_params_transformed(p)
+        if not self.EP:
+            self.K = self.kern.K(self.X,slices1=self.Xslices)
+            self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
+        else:
+            self._ep_covariance()
+
+    def _get_params(self):
+        # TODO: remove beta when using EP
+        return self.kern._get_params_transformed()
+
+    def _get_param_names(self):
+        # TODO: remove beta when using EP
+        return self.kern._get_param_names_transformed()
+
+    def approximate_likelihood(self):
+        assert not isinstance(self.likelihood, gaussian), "EP is only available for non-gaussian likelihoods"
+        self.ep_approx = Full(self.K,self.likelihood,epsilon=self.epsilon_ep,powerep=[self.eta,self.delta])
+        self.tau_tilde, self.v_tilde, self.Z_hat, self.tau_, self.v_=self.ep_approx.fit_EP()
+        # Y: EP likelihood is defined as a regression model for mu_tilde
+        self.Y = self.v_tilde/self.tau_tilde
+        self._Ymean = np.zeros((1,self.Y.shape[1]))
+        self._Ystd = np.ones((1,self.Y.shape[1]))
+        if self.D > self.N:
+            # then it's more efficient to store YYT
+            self.YYT = np.dot(self.Y, self.Y.T)
+        else:
+            self.YYT = None
+        self.mu_ = self.v_/self.tau_
+        self._ep_covariance()
+
+    def _ep_covariance(self):
+        # Kernel plus noise variance term
+        self.K = self.kern.K(self.X,slices1=self.Xslices) + np.diag(1./self.tau_tilde.flatten())
+        self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
+
+    def _model_fit_term(self):
+        """
+        Computes the model fit using YYT if it's available
+        """
+        if self.YYT is None:
+            return -0.5*np.sum(np.square(np.dot(self.Li,self.Y)))
+        else:
+            return -0.5*np.sum(np.multiply(self.Ki, self.YYT))
+
+    def _normalization_term(self):
+        """
+        Computes the marginal likelihood normalization constants
+        """
+        sigma_sum = 1./self.tau_ + 1./self.tau_tilde
+        mu_diff_2 = (self.mu_ - self.Y)**2
+        penalty_term = np.sum(np.log(self.Z_hat))
+        return penalty_term + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum)
+
+    def log_likelihood(self):
+        """
+        The log marginal likelihood for an EP model can be written as the log likelihood of
+        a regression model for a new variable Y* = v_tilde/tau_tilde, with a covariance
+        matrix K* = K + diag(1./tau_tilde) plus a normalization term.
+        """
+        complexity_term = -0.5*self.D*self.Kplus_logdet
+        normalization_term = 0 if self.EP == False else self.normalization_term()
+        return complexity_term + normalization_term + self._model_fit_term()
+
+
+    def log_likelihood(self):
+        complexity_term = -0.5*self.N*self.D*np.log(2.*np.pi) - 0.5*self.D*self.K_logdet
+        return complexity_term + self._model_fit_term()
+
+    def dL_dK(self):
+        if self.YYT is None:
+            alpha = np.dot(self.Ki,self.Y)
+            dL_dK = 0.5*(np.dot(alpha,alpha.T)-self.D*self.Ki)
+        else:
+            dL_dK = 0.5*(mdot(self.Ki, self.YYT, self.Ki) - self.D*self.Ki)
+
+        return dL_dK
+
+    def _log_likelihood_gradients(self):
+        return self.kern.dK_dtheta(partial=self.dL_dK(),X=self.X)
+
+    def predict(self,Xnew, slices=None, full_cov=False):
+        """
+
+        Predict the function(s) at the new point(s) Xnew.
+
+        Arguments
+        ---------
+        :param Xnew: The points at which to make a prediction
+        :type Xnew: np.ndarray, Nnew x self.Q
+        :param slices: specifies which outputs kernel(s) the Xnew correspond to (see below)
+        :type slices: (None, list of slice objects, list of ints)
+        :param full_cov: whether to return the full covariance matrix, or just the diagonal
+        :type full_cov: bool
+        :rtype: posterior mean, a Numpy array, Nnew x self.D
+        :rtype: posterior variance, a Numpy array, Nnew x Nnew x (self.D)
+
+        .. Note:: "slices" specifies how the points X_new co-vary with the training points.
+
+            - If None, the new points covary through every kernel part (default)
+            - If a list of slices, the i^th slice specifies which data are affected by the i^th kernel part
+            - If a list of booleans, specifying which kernel parts are active
+
+        If full_cov and self.D > 1, the return shape of var is Nnew x Nnew x self.D. If self.D == 1, the return shape is Nnew x Nnew.
+        This is to allow for different normalisations of the output dimensions.
+        """
+
+        #normalise X values
+        Xnew = (Xnew.copy() - self._Xmean) / self._Xstd
+        mu, var, phi = self._raw_predict(Xnew, slices, full_cov)
+
+        #un-normalise
+        mu = mu*self._Ystd + self._Ymean
+        if full_cov:
+            if self.D==1:
+                var *= np.square(self._Ystd)
+            else:
+                var = var[:,:,None] * np.square(self._Ystd)
+        else:
+            if self.D==1:
+                var *= np.square(np.squeeze(self._Ystd))
+            else:
+                var = var[:,None] * np.square(self._Ystd)
+
+        return mu,var,phi
+
+    def _raw_predict(self,_Xnew,slices, full_cov=False):
+        """Internal helper function for making predictions, does not account for normalisation"""
+        Kx = self.kern.K(self.X,_Xnew, slices1=self.Xslices,slices2=slices)
+        mu = np.dot(np.dot(Kx.T,self.Ki),self.Y)
+        KiKx = np.dot(self.Ki,Kx)
+        if full_cov:
+            Kxx = self.kern.K(_Xnew, slices1=slices,slices2=slices)
+            var = Kxx - np.dot(KiKx.T,Kx)
+        else:
+            Kxx = self.kern.Kdiag(_Xnew, slices=slices)
+            var = Kxx - np.sum(np.multiply(KiKx,Kx),0)
+        phi = None if not self.EP else self.likelihood.predictive_mean(mu,var)
+        return mu, var, phi
+
+    def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None):
+        """
+        :param samples: the number of a posteriori samples to plot
+        :param which_data: which of the training data to plot (default all)
+        :type which_data: 'all' or a slice object to slice self.X, self.Y
+        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
+        :param which_functions: which of the kernel functions to plot (additively)
+        :type which_functions: list of bools
+        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
+
+        Plot the posterior of the GP.
+          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
+          - In two dimensions, a contour-plot shows the mean predicted function
+          - In higher dimensions, we've not implemented this yet !TODO!
+
+        Can plot only part of the data and part of the posterior functions using which_data and which_functions
+        """
+        if which_functions=='all':
+            which_functions = [True]*self.kern.Nparts
+        if which_data=='all':
+            which_data = slice(None)
+
+        X = self.X[which_data,:]
+        Y = self.Y[which_data,:]
+
+        Xorig = X*self._Xstd + self._Xmean
+        Yorig = Y*self._Ystd + self._Ymean if not self.EP else self.likelihood.Y
+
+        if plot_limits is None:
+            xmin,xmax = Xorig.min(0),Xorig.max(0)
+            xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
+        elif len(plot_limits)==2:
+            xmin, xmax = plot_limits
+        else:
+            raise ValueError, "Bad limits for plotting"
+
+        if self.X.shape[1]==1:
+            Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None]
+            m,v,phi = self.predict(Xnew,slices=which_functions)
+            if self.EP:
+                pb.subplot(211)
+
+            gpplot(Xnew,m,v)
+            if samples:
+                s = np.random.multivariate_normal(m.flatten(),v,samples)
+                pb.plot(Xnew.flatten(),s.T, alpha = 0.4, c='#3465a4', linewidth = 0.8)
+
+            if not self.EP:
+                pb.plot(Xorig,Yorig,'kx',mew=1.5)
+                pb.xlim(xmin,xmax)
+            else:
+                pb.xlim(xmin,xmax)
+                pb.subplot(212)
+                self.likelihood.plot1Db(self.X,Xnew,phi)
+                pb.xlim(xmin,xmax)
+
+        elif self.X.shape[1]==2:
+            resolution = 50 or resolution
+            xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution]
+            Xtest = np.vstack((xx.flatten(),yy.flatten())).T
+            zz,vv,phi = self.predict(Xtest,slices=which_functions)
+            zz = zz.reshape(resolution,resolution)
+            pb.contour(xx,yy,zz,vmin=zz.min(),vmax=zz.max(),cmap=pb.cm.jet)
+            pb.scatter(Xorig[:,0],Xorig[:,1],40,Yorig,linewidth=0,cmap=pb.cm.jet,vmin=zz.min(),vmax=zz.max())
+            pb.xlim(xmin[0],xmax[0])
+            pb.ylim(xmin[1],xmax[1])
+
+        else:
+            raise NotImplementedError, "Cannot plot GPs with more than two input dimensions"
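The EP branch of GP.log_likelihood implements the usual rewriting of the EP evidence: the Gaussian sites turn the marginal likelihood into that of a regression model on pseudo-observations Y* = v_tilde/tau_tilde with covariance K* = K + diag(1/tau_tilde), plus the constants collected in _normalization_term. A minimal standalone sketch of the regression part (tau_tilde and v_tilde are toy stand-ins for EP output, not GPy code):

    import numpy as np

    rng = np.random.RandomState(4)
    A = rng.randn(4, 4)
    K = np.dot(A, A.T) + 4.*np.eye(4)       # toy prior covariance
    tau_tilde = rng.rand(4) + 0.5           # toy site precisions
    v_tilde = rng.randn(4)                  # toy scaled site means

    Y_star = v_tilde/tau_tilde
    K_star = K + np.diag(1./tau_tilde)

    # log N(Y*; 0, K*): complexity term plus model-fit term, as in log_likelihood
    _, logdet = np.linalg.slogdet(K_star)
    alpha = np.linalg.solve(K_star, Y_star)
    print(-0.5*len(Y_star)*np.log(2.*np.pi) - 0.5*logdet - 0.5*np.dot(Y_star, alpha))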
diff --git a/GPy/models/GP_EP.py b/GPy/models/GP_EP.py
index 302ff366..1c0b9cf6 100644
--- a/GPy/models/GP_EP.py
+++ b/GPy/models/GP_EP.py
@@ -62,7 +62,7 @@ class GP_EP(model):
         self.L = jitchol(B)
         V,info = linalg.flapack.dtrtrs(self.L,self.Sroot_tilde_K,lower=1)
         self.Sigma = self.K - np.dot(V.T,V)
-        self.mu = np.dot(self.Sigma,self.ep_approx.v_tilde)
+        self.mu = np.dot(self.Sigma,self.ep_approx.v_tilde) * self.Z_hat
 
     def log_likelihood(self):
         """
diff --git a/GPy/models/GP_EP2.py b/GPy/models/GP_EP2.py
index c68e7b70..ce869951 100644
--- a/GPy/models/GP_EP2.py
+++ b/GPy/models/GP_EP2.py
@@ -36,14 +36,11 @@ class GP_EP2(model):
         self.Xslices = Xslices
         assert isinstance(kernel, kern.kern)
         self.likelihood = likelihood
-        #self.Y = self.likelihood.Y #we might not need this
         self.kern = kernel
         self.X = X
         assert len(self.X.shape)==2
-        #assert len(self.Y.shape)==2
-        #assert self.X.shape[0] == self.Y.shape[0]
-        #self.N, self.D = self.Y.shape
-        self.D = 1
+        assert self.X.shape[0] == self.likelihood.Y.shape[0]
+        self.D = self.likelihood.Y.shape[1]
         self.N, self.Q = self.X.shape
 
         #here's some simple normalisation
@@ -75,14 +72,17 @@ class GP_EP2(model):
         """
         self.eta,self.delta = powerep
         self.epsilon_ep = epsilon_ep
-        self.tau_tilde = np.zeros([self.N,self.D])
+        self.tau_tilde = np.ones([self.N,self.D])
         self.v_tilde = np.zeros([self.N,self.D])
+        self.tau_ = np.ones([self.N,self.D])
+        self.v_ = np.zeros([self.N,self.D])
+        self.Z_hat = np.ones([self.N,self.D])
         model.__init__(self)
 
     def _set_params(self,p):
         self.kern._set_params_transformed(p)
         self.K = self.kern.K(self.X,slices1=self.Xslices)
-        self.posterior_params()
+        self._ep_params()
 
     def _get_params(self):
         return self.kern._get_params_transformed()
@@ -92,52 +92,63 @@ class GP_EP2(model):
     def approximate_likelihood(self):
         self.ep_approx = Full(self.K,self.likelihood,epsilon=self.epsilon_ep,powerep=[self.eta,self.delta])
-        self.ep_approx.fit_EP()
-        self.tau_tilde = self.ep_approx.tau_tilde[:,None]
-        self.v_tilde = self.ep_approx.tau_tilde[:,None]
-        self.posterior_params()
-        self.Y = self.v_tilde/self.tau_tilde
-        self._Ymean = np.zeros((1,self.Y.shape[1]))
-        self._Ystd = np.ones((1,self.Y.shape[1]))
-        #self.YYT = np.dot(self.Y, self.Y.T)
+        self.tau_tilde, self.v_tilde, self.Z_hat, self.tau_, self.v_=self.ep_approx.fit_EP()
+        self._ep_params()
 
-    def posterior_params(self):
-        self.Sroot_tilde_K = np.sqrt(self.tau_tilde.flatten())[:,None]*self.K
+    def _ep_params(self):
+        # Posterior mean and Variance computation
+        self.Sroot_tilde_K = np.sqrt(self.tau_tilde)*self.K
         B = np.eye(self.N) + np.sqrt(self.tau_tilde.flatten())[None,:]*self.Sroot_tilde_K
         self.Bi,self.L,self.Li,B_logdet = pdinv(B)
         V = np.dot(self.Li,self.Sroot_tilde_K)
-        #V,info = linalg.flapack.dtrtrs(self.L,self.Sroot_tilde_K,lower=1)
-        self.Sigma = self.K - np.dot(V.T,V)
-        self.mu = np.dot(self.Sigma,self.v_tilde.flatten())
+        self.Sigma = self.K - np.dot(V.T,V) #posterior variance
+        self.mu = np.dot(self.Sigma,self.v_tilde) #posterior mean
+        # Kernel plus noise variance term
+        self.Kplus = self.K + np.diag(1./self.tau_tilde.flatten())
+        self.Kplusi,self.Lplus,self.Lplusi,self.Kplus_logdet = pdinv(self.Kplus)
+        # Y: EP likelihood is defined as a regression model for mu_tilde
+        self.Y = self.v_tilde/self.tau_tilde
+        self._Ymean = np.zeros((1,self.Y.shape[1]))
+        self._Ystd = np.ones((1,self.Y.shape[1]))
+        self.YYT = None #np.dot(self.Y, self.Y.T)
+        self.mu_ = self.v_/self.tau_
 
+    def _model_fit_term(self):
+        """
+        Computes the model fit using YYT if it's available
+        """
+        if self.YYT is None:
+            return -0.5*np.sum(np.square(np.dot(self.Lplusi,self.Y)))
+        else:
+            return -0.5*np.sum(np.multiply(self.Kplusi, self.YYT))
 
-    #def _model_fit_term(self):
-    #    """
-    #    Computes the model fit using YYT if it's available
-    #    """
-    #    if self.YYT is None:
-    #        return -0.5*np.sum(np.square(np.dot(self.Li,self.Y)))
-    #    else:
-    #        return -0.5*np.sum(np.multiply(self.Ki, self.YYT))
+    def _normalization_term(self):
+        """
+        Computes the marginal likelihood normalization constants
+        """
+        sigma_sum = 1./self.tau_ + 1./self.tau_tilde
+        mu_diff_2 = (self.mu_ - self.Y)**2
+        penalty_term = np.sum(np.log(self.Z_hat))
+        return penalty_term + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum)
 
     def log_likelihood(self):
-        mu_ = self.ep_approx.v_/self.ep_approx.tau_
-        L1 =.5*sum(np.log(1+self.ep_approx.tau_tilde*1./self.ep_approx.tau_))-sum(np.log(np.diag(self.L)))
-        L2A =.5*np.sum((self.Sigma-np.diag(1./(self.ep_approx.tau_+self.ep_approx.tau_tilde))) * np.dot(self.ep_approx.v_tilde[:,None],self.ep_approx.v_tilde[None,:]))
-        L2B = .5*np.dot(mu_*(self.ep_approx.tau_/(self.ep_approx.tau_tilde+self.ep_approx.tau_)),self.ep_approx.tau_tilde*mu_ - 2*self.ep_approx.v_tilde)
-        L3 = sum(np.log(self.ep_approx.Z_hat))
-        return L1 + L2A + L2B + L3
+        """
+        The log marginal likelihood for an EP model can be written as the log likelihood of
+        a regression model for a new variable Y* = v_tilde/tau_tilde, with a covariance
+        matrix K* = K + diag(1./tau_tilde) plus a normalization term.
+ """ + complexity_term = -0.5*self.D*self.Kplus_logdet + return complexity_term + self._model_fit_term() + self._normalization_term() - def dL_dK(self): #FIXME + def dL_dK(self): if self.YYT is None: - alpha = np.dot(self.Ki,self.Y) - dL_dK = 0.5*(np.dot(alpha,alpha.T)-self.D*self.Ki) + alpha = np.dot(self.Kplusi,self.Y) + dL_dK = 0.5*(np.dot(alpha,alpha.T)-self.D*self.Kplusi) else: - dL_dK = 0.5*(mdot(self.Ki, self.YYT, self.Ki) - self.D*self.Ki) - + dL_dK = 0.5*(mdot(self.Kplusi, self.YYT, self.Kplusi) - self.D*self.Kplusi) return dL_dK - def _log_likelihood_gradients(self): #FIXME + def _log_likelihood_gradients(self): return self.kern.dK_dtheta(partial=self.dL_dK(),X=self.X) def predict(self,Xnew, slices=None, full_cov=False): @@ -189,32 +200,20 @@ class GP_EP2(model): def _raw_predict(self,_Xnew,slices, full_cov=False): """Internal helper function for making predictions, does not account for normalisation""" - """ - Kx = self.kern.K(self.X,_Xnew, slices1=self.Xslices,slices2=slices) - mu = np.dot(np.dot(Kx.T,self.Ki),self.Y) - KiKx = np.dot(self.Ki,Kx) + K_x = self.kern.K(self.X,_Xnew,slices1=self.Xslices,slices2=slices) + aux2 = mdot(self.Bi,self.Sroot_tilde_K,self.v_tilde) + zeta = np.sqrt(self.tau_tilde)*aux2 + f = np.dot(K_x.T,self.v_tilde-zeta) + v = mdot(self.Li,np.sqrt(self.tau_tilde)*K_x) if full_cov: - Kxx = self.kern.K(_Xnew, slices1=slices,slices2=slices) - var = Kxx - np.dot(KiKx.T,Kx) + Kxx = self.kern.K(_Xnew,slices1=slices,slices2=slices) + var = Kxx - np.dot(v.T,v) + var_diag = np.diag(var)[:,None] else: Kxx = self.kern.Kdiag(_Xnew, slices=slices) - var = Kxx - np.sum(np.multiply(KiKx,Kx),0) - return mu, var - """ - K_x = self.kern.K(self.X,_Xnew) - Kxx = self.kern.K(_Xnew) - #aux1,info = linalg.flapack.dtrtrs(self.L,np.dot(self.Sroot_tilde_K,self.ep_approx.v_tilde),lower=1) - #aux2,info = linalg.flapack.dtrtrs(self.L.T, aux1,lower=0) - #aux2 = mdot(self.Li.T,self.Li,self.Sroot_tilde_K,self.ep_approx.v_tilde) - aux2 = mdot(self.Bi,self.Sroot_tilde_K,self.ep_approx.v_tilde) - zeta = np.sqrt(self.ep_approx.tau_tilde)*aux2 - f = np.dot(K_x.T,self.ep_approx.v_tilde-zeta) - #v,info = linalg.flapack.dtrtrs(self.L,np.sqrt(self.ep_approx.tau_tilde)[:,None]*K_x,lower=1) - v = mdot(self.Li,np.sqrt(self.ep_approx.tau_tilde)[:,None]*K_x) - variance = Kxx - np.dot(v.T,v) - vdiag = np.diag(variance) - y=self.likelihood.predictive_mean(f,vdiag) - return f,vdiag,y + var_diag = (Kxx - np.sum(v**2,-2))[:,None] + phi = self.likelihood.predictive_mean(f,var_diag) + return f, var_diag, phi def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None): """ @@ -257,7 +256,7 @@ class GP_EP2(model): #gpplot(Xnew,m,v) mu_f, var_f, phi_f = self.predict(Xnew,slices=which_functions) pb.subplot(211) - self.likelihood.plot1Da(X_new=Xnew,Mean_new=mu_f,Var_new=var_f,X_u=self.X,Mean_u=self.mu,Var_u=np.diag(self.Sigma)) + self.likelihood.plot1Da(X=Xnew,mean=mu_f,var=var_f,Z=self.X,mean_Z=self.mu,var_Z=np.diag(self.Sigma)) if samples: s = np.random.multivariate_normal(m.flatten(),v,samples) pb.plot(Xnew.flatten(),s.T, alpha = 0.4, c='#3465a4', linewidth = 0.8) diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py index 5f824f2b..ca44aab1 100644 --- a/GPy/models/__init__.py +++ b/GPy/models/__init__.py @@ -11,3 +11,4 @@ from GP_EP2 import GP_EP2 from generalized_FITC import generalized_FITC from sparse_GPLVM import sparse_GPLVM from uncollapsed_sparse_GP import uncollapsed_sparse_GP +from GP import GP From 738ca78dac64b0806eeea7bd247849db751e565b Mon Sep 17 
From 738ca78dac64b0806eeea7bd247849db751e565b Mon Sep 17 00:00:00 2001
From: Ricardo Andrade
Date: Fri, 25 Jan 2013 18:24:10 +0000
Subject: [PATCH 03/44] No more GP_EP stuff

---
 GPy/inference/Expectation_Propagation.py | 240 ------------------
 GPy/models/GP_EP.py                      | 160 -------------
 GPy/models/GP_EP2.py                     | 279 -----------------------
 GPy/models/__init__.py                   |   2 -
 GPy/models/generalized_FITC.py           |   3 +-
 5 files changed, 2 insertions(+), 682 deletions(-)
 delete mode 100644 GPy/inference/Expectation_Propagation.py
 delete mode 100644 GPy/models/GP_EP.py
 delete mode 100644 GPy/models/GP_EP2.py

diff --git a/GPy/inference/Expectation_Propagation.py b/GPy/inference/Expectation_Propagation.py
deleted file mode 100644
index 520fc607..00000000
--- a/GPy/inference/Expectation_Propagation.py
+++ /dev/null
@@ -1,240 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-
-import numpy as np
-import random
-from scipy import stats, linalg
-from .likelihoods import likelihood
-from ..core import model
-from ..util.linalg import pdinv,mdot,jitchol
-from ..util.plot import gpplot
-from .. import kern
-
-class EP_base:
-    """
-    Expectation Propagation.
-
-    This is just the base class for expectation propagation. We'll extend it for full and sparse EP.
-    """
-    def __init__(self,likelihood,epsilon=1e-3,powerep=[1.,1.]):
-        self.likelihood = likelihood
-        self.epsilon = epsilon
-        self.eta, self.delta = powerep
-        self.jitter = 1e-12
-
-        #Initial values - Likelihood approximation parameters:
-        #p(y|f) = t(f|tau_tilde,v_tilde)
-        self.restart_EP()
-
-    def restart_EP(self):
-        """
-        Set the EP approximation to its initial state
-        """
-        self.tau_tilde = np.zeros(self.N)
-        self.v_tilde = np.zeros(self.N)
-        self.mu = np.zeros(self.N)
-
-class Full(EP_base):
-    """
-    :param likelihood: Output's likelihood (e.g. probit)
-    :type likelihood: GPy.inference.likelihood instance
-    :param K: prior covariance matrix
-    :type K: np.ndarray (N x N)
-    :param likelihood: Output's likelihood (e.g. probit)
-    :type likelihood: GPy.inference.likelihood instance
-    :param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
-    :param powerep: Power-EP parameters (eta,delta) - 2x1 numpy array (floats)
-    """
-    def __init__(self,K,likelihood,*args,**kwargs):
-        assert K.shape[0] == K.shape[1]
-        self.K = K
-        self.N = self.K.shape[0]
-        EP_base.__init__(self,likelihood,*args,**kwargs)
-
-    def fit_EP(self,messages=False):
-        """
-        The expectation-propagation algorithm.
-        For nomenclature see Rasmussen & Williams 2006 (pp. 52-60)
-        """
-        #Prior distribution parameters: p(f|X) = N(f|0,K)
-        #self.K = self.kernel.K(self.X,self.X)
-
-        #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
-        self.mu=np.zeros(self.N)
-        self.Sigma=self.K.copy()
-
-        """
-        Initial values - Cavity distribution parameters:
-        q_(f|mu_,sigma2_) = Product{q_i(f|mu_i,sigma2_i)}
-        sigma_ = 1./tau_
-        mu_ = v_/tau_
-        """
-
-        self.tau_ = np.empty(self.N,dtype=np.float64)
-        self.v_ = np.empty(self.N,dtype=np.float64)
-
-        #Initial values - Marginal moments
-        z = np.empty(self.N,dtype=np.float64)
-        self.Z_hat = np.empty(self.N,dtype=np.float64)
-        phi = np.empty(self.N,dtype=np.float64)
-        mu_hat = np.empty(self.N,dtype=np.float64)
-        sigma2_hat = np.empty(self.N,dtype=np.float64)
-
-        #Approximation
-        epsilon_np1 = self.epsilon + 1.
-        epsilon_np2 = self.epsilon + 1.
-        self.iterations = 0
-        self.np1 = [self.tau_tilde.copy()]
-        self.np2 = [self.v_tilde.copy()]
-        while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
-            update_order = np.random.permutation(self.N)
-            for i in update_order:
-                #Cavity distribution parameters
-                self.tau_[i] = 1./self.Sigma[i,i] - self.eta*self.tau_tilde[i]
-                self.v_[i] = self.mu[i]/self.Sigma[i,i] - self.eta*self.v_tilde[i]
-                #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood.moments_match(i,self.tau_[i],self.v_[i])
-                #Site parameters update
-                Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./self.Sigma[i,i])
-                Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - self.mu[i]/self.Sigma[i,i])
-                self.tau_tilde[i] = self.tau_tilde[i] + Delta_tau
-                self.v_tilde[i] = self.v_tilde[i] + Delta_v
-                #Posterior distribution parameters update
-                si=self.Sigma[:,i].reshape(self.N,1)
-                self.Sigma = self.Sigma - Delta_tau/(1.+ Delta_tau*self.Sigma[i,i])*np.dot(si,si.T)
-                self.mu = np.dot(self.Sigma,self.v_tilde)
-                self.iterations += 1
-            #Sigma recomputation with Cholesky decomposition
-            Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*(self.K)
-            B = np.eye(self.N) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K
-            L = jitchol(B)
-            V,info = linalg.flapack.dtrtrs(L,Sroot_tilde_K,lower=1)
-            self.Sigma = self.K - np.dot(V.T,V)
-            self.mu = np.dot(self.Sigma,self.v_tilde)
-            epsilon_np1 = np.mean(self.tau_tilde-self.np1[-1]**2)
-            epsilon_np2 = np.mean(self.v_tilde-self.np2[-1]**2)
-            self.np1.append(self.tau_tilde.copy())
-            self.np2.append(self.v_tilde.copy())
-            if messages:
-                print "EP iteration %i, epsilon %d"%(self.iterations,epsilon_np1)
-
-class FITC(EP_base):
-    """
-    :param likelihood: Output's likelihood (e.g. probit)
-    :type likelihood: GPy.inference.likelihood instance
-    :param Knn_diag: The diagonal elements of Knn is a 1D vector
-    :param Kmn: The 'cross' variance between inducing inputs and data
-    :param Kmm: the covariance matrix of the inducing inputs
-    :param likelihood: Output's likelihood (e.g. probit)
-    :type likelihood: GPy.inference.likelihood instance
-    :param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
-    :param powerep: Power-EP parameters (eta,delta) - 2x1 numpy array (floats)
-    """
-    def __init__(self,likelihood,Knn_diag,Kmn,Kmm,*args,**kwargs):
-        self.Knn_diag = Knn_diag
-        self.Kmn = Kmn
-        self.Kmm = Kmm
-        self.M = self.Kmn.shape[0]
-        self.N = self.Kmn.shape[1]
-        assert self.M <= self.N, 'The number of inducing inputs must be smaller than the number of observations'
-        assert len(Knn_diag) == self.N, 'Knn_diagonal has size different from N'
-        EP_base.__init__(self,likelihood,*args,**kwargs)
-
-    def fit_EP(self):
-        """
-        The expectation-propagation algorithm with sparse pseudo-input.
-        For nomenclature see Naish-Guzman and Holden, 2008.
- """ - - """ - Prior approximation parameters: - q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0) - Sigma0 = diag(Knn-Qnn) + Qnn, Qnn = Knm*Kmmi*Kmn - """ - self.Kmmi, self.Kmm_hld = pdinv(self.Kmm) - self.P0 = self.Kmn.T - self.KmnKnm = np.dot(self.P0.T, self.P0) - self.KmmiKmn = np.dot(self.Kmmi,self.P0.T) - self.Qnn_diag = np.sum(self.P0.T*self.KmmiKmn,-2) - self.Diag0 = self.Knn_diag - self.Qnn_diag - self.R0 = jitchol(self.Kmmi).T - - """ - Posterior approximation: q(f|y) = N(f| mu, Sigma) - Sigma = Diag + P*R.T*R*P.T + K - mu = w + P*gamma - """ - self.w = np.zeros(self.N) - self.gamma = np.zeros(self.M) - self.mu = np.zeros(self.N) - self.P = self.P0.copy() - self.R = self.R0.copy() - self.Diag = self.Diag0.copy() - self.Sigma_diag = self.Knn_diag - - """ - Initial values - Cavity distribution parameters: - q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)} - sigma_ = 1./tau_ - mu_ = v_/tau_ - """ - self.tau_ = np.empty(self.N,dtype=np.float64) - self.v_ = np.empty(self.N,dtype=np.float64) - - #Initial values - Marginal moments - z = np.empty(self.N,dtype=np.float64) - self.Z_hat = np.empty(self.N,dtype=np.float64) - phi = np.empty(self.N,dtype=np.float64) - mu_hat = np.empty(self.N,dtype=np.float64) - sigma2_hat = np.empty(self.N,dtype=np.float64) - - #Approximation - epsilon_np1 = 1 - epsilon_np2 = 1 - self.iterations = 0 - self.np1 = [self.tau_tilde.copy()] - self.np2 = [self.v_tilde.copy()] - while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: - update_order = np.arange(self.N) - random.shuffle(update_order) - for i in update_order: - #Cavity distribution parameters - self.tau_[i] = 1./self.Sigma_diag[i] - self.eta*self.tau_tilde[i] - self.v_[i] = self.mu[i]/self.Sigma_diag[i] - self.eta*self.v_tilde[i] - #Marginal moments - self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood.moments_match(i,self.tau_[i],self.v_[i]) - #Site parameters update - Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./self.Sigma_diag[i]) - Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - self.mu[i]/self.Sigma_diag[i]) - self.tau_tilde[i] = self.tau_tilde[i] + Delta_tau - self.v_tilde[i] = self.v_tilde[i] + Delta_v - #Posterior distribution parameters update - dtd1 = Delta_tau*self.Diag[i] + 1. 
- dii = self.Diag[i] - self.Diag[i] = dii - (Delta_tau * dii**2.)/dtd1 - pi_ = self.P[i,:].reshape(1,self.M) - self.P[i,:] = pi_ - (Delta_tau*dii)/dtd1 * pi_ - Rp_i = np.dot(self.R,pi_.T) - RTR = np.dot(self.R.T,np.dot(np.eye(self.M) - Delta_tau/(1.+Delta_tau*self.Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),self.R)) - self.R = jitchol(RTR).T - self.w[i] = self.w[i] + (Delta_v - Delta_tau*self.w[i])*dii/dtd1 - self.gamma = self.gamma + (Delta_v - Delta_tau*self.mu[i])*np.dot(RTR,self.P[i,:].T) - self.RPT = np.dot(self.R,self.P.T) - self.Sigma_diag = self.Diag + np.sum(self.RPT.T*self.RPT.T,-1) - self.mu = self.w + np.dot(self.P,self.gamma) - self.iterations += 1 - #Sigma recomptutation with Cholesky decompositon - self.Diag = self.Diag0/(1.+ self.Diag0 * self.tau_tilde) - self.P = (self.Diag / self.Diag0)[:,None] * self.P0 - self.RPT0 = np.dot(self.R0,self.P0.T) - L = jitchol(np.eye(self.M) + np.dot(self.RPT0,(1./self.Diag0 - self.Diag/(self.Diag0**2))[:,None]*self.RPT0.T)) - self.R,info = linalg.flapack.dtrtrs(L,self.R0,lower=1) - self.RPT = np.dot(self.R,self.P.T) - self.Sigma_diag = self.Diag + np.sum(self.RPT.T*self.RPT.T,-1) - self.w = self.Diag * self.v_tilde - self.gamma = np.dot(self.R.T, np.dot(self.RPT,self.v_tilde)) - self.mu = self.w + np.dot(self.P,self.gamma) - epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N - epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N - self.np1.append(self.tau_tilde.copy()) - self.np2.append(self.v_tilde.copy()) diff --git a/GPy/models/GP_EP.py b/GPy/models/GP_EP.py deleted file mode 100644 index 1c0b9cf6..00000000 --- a/GPy/models/GP_EP.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). -# Licensed under the BSD 3-clause license (see LICENSE.txt) - - -import numpy as np -import pylab as pb -from scipy import stats, linalg -from .. import kern -from ..inference.Expectation_Propagation import Full -from ..inference.likelihoods import likelihood,probit#,poisson,gaussian -from ..core import model -from ..util.linalg import pdinv,jitchol -from ..util.plot import gpplot - -class GP_EP(model): - def __init__(self,X,likelihood,kernel=None,epsilon_ep=1e-3,epsion_em=.1,powerep=[1.,1.]): - """ - Simple Gaussian Process with Non-Gaussian likelihood - - Arguments - --------- - :param X: input observations (NxD numpy.darray) - :param likelihood: a GPy likelihood (likelihood class) - :param kernel: a GPy kernel (kern class) - :param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1 (float) - :param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.] (list) - :rtype: GPy model class. 
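The periodic refresh above, like posterior_param in the GP_EP class that follows, rebuilds (mu, Sigma) from the site precisions through B = I + S^{1/2} K S^{1/2}, which keeps a valid Cholesky factor even when K itself is nearly singular. A standalone sketch of the dense version (the trailing * self.Z_hat on the mean in posterior_param below looks spurious and is omitted here):

    import numpy as np
    from scipy import linalg

    def posterior_from_sites(K, tau_tilde, v_tilde):
        # Stable (mu, Sigma) from EP site parameters:
        # Sigma = K - K S^{1/2} B^{-1} S^{1/2} K  with  B = I + S^{1/2} K S^{1/2}
        N = K.shape[0]
        sroot = np.sqrt(tau_tilde)
        B = np.eye(N) + sroot[:, None]*K*sroot[None, :]
        L = linalg.cholesky(B, lower=True)
        V = linalg.solve_triangular(L, sroot[:, None]*K, lower=True)
        Sigma = K - np.dot(V.T, V)
        mu = np.dot(Sigma, v_tilde)
        return mu, Sigma, L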
- """ - if kernel is None: - kernel = kern.rbf(X.shape[1]) + kern.bias(X.shape[1]) + kern.white(X.shape[1]) - - assert isinstance(kernel,kern.kern), 'kernel is not a kern instance' - self.likelihood = likelihood - self.Y = self.likelihood.Y - self.kernel = kernel - self.X = X - self.N, self.D = self.X.shape - self.eta,self.delta = powerep - self.epsilon_ep = epsilon_ep - self.jitter = 1e-12 - self.K = self.kernel.K(self.X) - model.__init__(self) - - def _set_params(self,p): - self.kernel._set_params_transformed(p) - - def _get_params(self): - return self.kernel._get_params_transformed() - - def _get_param_names(self): - return self.kernel._get_param_names_transformed() - - def approximate_likelihood(self): - self.ep_approx = Full(self.K,self.likelihood,epsilon=self.epsilon_ep,powerep=[self.eta,self.delta]) - self.ep_approx.fit_EP() - - def posterior_param(self): - self.K = self.kernel.K(self.X) - self.Sroot_tilde_K = np.sqrt(self.ep_approx.tau_tilde)[:,None]*self.K - B = np.eye(self.N) + np.sqrt(self.ep_approx.tau_tilde)*self.Sroot_tilde_K - #self.L = np.linalg.cholesky(B) - self.L = jitchol(B) - V,info = linalg.flapack.dtrtrs(self.L,self.Sroot_tilde_K,lower=1) - self.Sigma = self.K - np.dot(V.T,V) - self.mu = np.dot(self.Sigma,self.ep_approx.v_tilde) * self.Z_hat - - def log_likelihood(self): - """ - Returns - ------- - The EP approximation to the log-marginal likelihood - """ - self.posterior_param() - mu_ = self.ep_approx.v_/self.ep_approx.tau_ - L1 =.5*sum(np.log(1+self.ep_approx.tau_tilde*1./self.ep_approx.tau_))-sum(np.log(np.diag(self.L))) - L2A =.5*np.sum((self.Sigma-np.diag(1./(self.ep_approx.tau_+self.ep_approx.tau_tilde))) * np.dot(self.ep_approx.v_tilde[:,None],self.ep_approx.v_tilde[None,:])) - L2B = .5*np.dot(mu_*(self.ep_approx.tau_/(self.ep_approx.tau_tilde+self.ep_approx.tau_)),self.ep_approx.tau_tilde*mu_ - 2*self.ep_approx.v_tilde) - L3 = sum(np.log(self.ep_approx.Z_hat)) - return L1 + L2A + L2B + L3 - - def _log_likelihood_gradients(self): - dK_dp = self.kernel.dK_dtheta(self.X) - self.dK_dp = dK_dp - aux1,info_1 = linalg.flapack.dtrtrs(self.L,np.dot(self.Sroot_tilde_K,self.ep_approx.v_tilde),lower=1) - b = self.ep_approx.v_tilde - np.sqrt(self.ep_approx.tau_tilde)*linalg.flapack.dtrtrs(self.L.T,aux1)[0] - U,info_u = linalg.flapack.dtrtrs(self.L,np.diag(np.sqrt(self.ep_approx.tau_tilde)),lower=1) - dL_dK = 0.5*(np.outer(b,b)-np.dot(U.T,U)) - self.dL_dK = dL_dK - return np.array([np.sum(dK_dpi*dL_dK) for dK_dpi in dK_dp.T]) - - def predict(self,X): - #TODO: check output dimensions - self.posterior_param() - K_x = self.kernel.K(self.X,X) - Kxx = self.kernel.K(X) - aux1,info = linalg.flapack.dtrtrs(self.L,np.dot(self.Sroot_tilde_K,self.ep_approx.v_tilde),lower=1) - aux2,info = linalg.flapack.dtrtrs(self.L.T, aux1,lower=0) - zeta = np.sqrt(self.ep_approx.tau_tilde)*aux2 - f = np.dot(K_x.T,self.ep_approx.v_tilde-zeta) - v,info = linalg.flapack.dtrtrs(self.L,np.sqrt(self.ep_approx.tau_tilde)[:,None]*K_x,lower=1) - variance = Kxx - np.dot(v.T,v) - vdiag = np.diag(variance) - y=self.likelihood.predictive_mean(f,vdiag) - return f,vdiag,y - - def plot(self): - """ - Plot the fitted model: training function values, inducing points used, mean estimate and confidence intervals. 
- """ - if self.X.shape[1]==1: - pb.figure() - xmin,xmax = self.X.min(),self.X.max() - xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin) - Xnew = np.linspace(xmin,xmax,100)[:,None] - mu_f, var_f, mu_phi = self.predict(Xnew) - pb.subplot(211) - self.likelihood.plot1Da(X_new=Xnew,Mean_new=mu_f,Var_new=var_f,X_u=self.X,Mean_u=self.mu,Var_u=np.diag(self.Sigma)) - pb.subplot(212) - self.likelihood.plot1Db(self.X,Xnew,mu_phi) - elif self.X.shape[1]==2: - pb.figure() - x1min,x1max = self.X[:,0].min(0),self.X[:,0].max(0) - x2min,x2max = self.X[:,1].min(0),self.X[:,1].max(0) - x1min, x1max = x1min-0.2*(x1max-x1min), x1max+0.2*(x1max-x1min) - x2min, x2max = x2min-0.2*(x2max-x2min), x2max+0.2*(x1max-x1min) - axis1 = np.linspace(x1min,x1max,50) - axis2 = np.linspace(x2min,x2max,50) - XX1, XX2 = [e.flatten() for e in np.meshgrid(axis1,axis2)] - Xnew = np.c_[XX1.flatten(),XX2.flatten()] - f,v,p = self.predict(Xnew) - self.likelihood.plot2D(self.X,Xnew,p) - else: - raise NotImplementedError, "Cannot plot GPs with more than two input dimensions" - - def em(self,max_f_eval=1e4,epsilon=.1,plot_all=False): #TODO check this makes sense - """ - Fits sparse_EP and optimizes the hyperparametes iteratively until convergence is achieved. - """ - self.epsilon_em = epsilon - log_likelihood_change = self.epsilon_em + 1. - self.parameters_path = [self.kernel._get_params()] - self.approximate_likelihood() - self.site_approximations_path = [[self.ep_approx.tau_tilde,self.ep_approx.v_tilde]] - self.log_likelihood_path = [self.log_likelihood()] - iteration = 0 - while log_likelihood_change > self.epsilon_em: - print 'EM iteration', iteration - self.optimize(max_f_eval = max_f_eval) - log_likelihood_new = self.log_likelihood() - log_likelihood_change = log_likelihood_new - self.log_likelihood_path[-1] - if log_likelihood_change < 0: - print 'log_likelihood decrement' - self.kernel._set_params_transformed(self.parameters_path[-1]) - self.kernM._set_params_transformed(self.parameters_path[-1]) - else: - self.approximate_likelihood() - self.log_likelihood_path.append(self.log_likelihood()) - self.parameters_path.append(self.kernel._get_params()) - self.site_approximations_path.append([self.ep_approx.tau_tilde,self.ep_approx.v_tilde]) - iteration += 1 diff --git a/GPy/models/GP_EP2.py b/GPy/models/GP_EP2.py deleted file mode 100644 index ce869951..00000000 --- a/GPy/models/GP_EP2.py +++ /dev/null @@ -1,279 +0,0 @@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). -# Licensed under the BSD 3-clause license (see LICENSE.txt) - -import numpy as np -import pylab as pb -from scipy import stats, linalg -from .. 
import kern -from ..inference.EP import Full -from ..inference.likelihoods import likelihood,probit,poisson,gaussian -from ..core import model -from ..util.linalg import pdinv,mdot #,jitchol -from ..util.plot import gpplot, Tango - -class GP_EP2(model): - def __init__(self,X,likelihood,kernel=None,normalize_X=False,Xslices=None,epsilon_ep=1e-3,epsion_em=.1,powerep=[1.,1.]): - """ - Simple Gaussian Process with Non-Gaussian likelihood - - Arguments - --------- - :param X: input observations (NxD numpy.darray) - :param likelihood: a GPy likelihood (likelihood class) - :param kernel: a GPy kernel, defaults to rbf+white - :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales) - :type normalize_X: False|True - :param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 1e-3 - :param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.] (list) - :param Xslices: how the X,Y data co-vary in the kernel (i.e. which "outputs" they correspond to). See (link:slicing) - :rtype: model object. - """ - #.. Note:: Multiple independent outputs are allowed using columns of Y #TODO add this note? - if kernel is None: - kernel = kern.rbf(X.shape[1]) + kern.bias(X.shape[1]) + kern.white(X.shape[1]) - - # parse arguments - self.Xslices = Xslices - assert isinstance(kernel, kern.kern) - self.likelihood = likelihood - self.kern = kernel - self.X = X - assert len(self.X.shape)==2 - assert self.X.shape[0] == self.likelihood.Y.shape[0] - self.D = self.likelihood.Y.shape[1] - self.N, self.Q = self.X.shape - - #here's some simple normalisation - if normalize_X: - self._Xmean = X.mean(0)[None,:] - self._Xstd = X.std(0)[None,:] - self.X = (X.copy() - self._Xmean) / self._Xstd - if hasattr(self,'Z'): - self.Z = (self.Z - self._Xmean) / self._Xstd - else: - self._Xmean = np.zeros((1,self.X.shape[1])) - self._Xstd = np.ones((1,self.X.shape[1])) - - #THIS PART IS NOT NEEDED - """ - if normalize_Y: - self._Ymean = Y.mean(0)[None,:] - self._Ystd = Y.std(0)[None,:] - self.Y = (Y.copy()- self._Ymean) / self._Ystd - else: - self._Ymean = np.zeros((1,self.Y.shape[1])) - self._Ystd = np.ones((1,self.Y.shape[1])) - - if self.D > self.N: - # then it's more efficient to store YYT - self.YYT = np.dot(self.Y, self.Y.T) - else: - self.YYT = None - """ - self.eta,self.delta = powerep - self.epsilon_ep = epsilon_ep - self.tau_tilde = np.ones([self.N,self.D]) - self.v_tilde = np.zeros([self.N,self.D]) - self.tau_ = np.ones([self.N,self.D]) - self.v_ = np.zeros([self.N,self.D]) - self.Z_hat = np.ones([self.N,self.D]) - model.__init__(self) - - def _set_params(self,p): - self.kern._set_params_transformed(p) - self.K = self.kern.K(self.X,slices1=self.Xslices) - self._ep_params() - - def _get_params(self): - return self.kern._get_params_transformed() - - def _get_param_names(self): - return self.kern._get_param_names_transformed() - - def approximate_likelihood(self): - self.ep_approx = Full(self.K,self.likelihood,epsilon=self.epsilon_ep,powerep=[self.eta,self.delta]) - self.tau_tilde, self.v_tilde, self.Z_hat, self.tau_, self.v_=self.ep_approx.fit_EP() - self._ep_params() - - def _ep_params(self): - # Posterior mean and Variance computation - self.Sroot_tilde_K = np.sqrt(self.tau_tilde)*self.K - B = np.eye(self.N) + np.sqrt(self.tau_tilde.flatten())[None,:]*self.Sroot_tilde_K - self.Bi,self.L,self.Li,B_logdet = pdinv(B) - V = np.dot(self.Li,self.Sroot_tilde_K) - self.Sigma = self.K - np.dot(V.T,V) #posterior variance - self.mu = 
np.dot(self.Sigma,self.v_tilde) #posterior mean - # Kernel plus noise variance term - self.Kplus = self.K + np.diag(1./self.tau_tilde.flatten()) - self.Kplusi,self.Lplus,self.Lplusi,self.Kplus_logdet = pdinv(self.Kplus) - # Y: EP likelihood is defined as a regression model for mu_tilde - self.Y = self.v_tilde/self.tau_tilde - self._Ymean = np.zeros((1,self.Y.shape[1])) - self._Ystd = np.ones((1,self.Y.shape[1])) - self.YYT = None #np.dot(self.Y, self.Y.T) - self.mu_ = self.v_/self.tau_ - - def _model_fit_term(self): - """ - Computes the model fit using YYT if it's available - """ - if self.YYT is None: - return -0.5*np.sum(np.square(np.dot(self.Lplusi,self.Y))) - else: - return -0.5*np.sum(np.multiply(self.Kplusi, self.YYT)) - - def _normalization_term(self): - """ - Computes the marginal likelihood normalization constants - """ - sigma_sum = 1./self.tau_ + 1./self.tau_tilde - mu_diff_2 = (self.mu_ - self.Y)**2 - penalty_term = np.sum(np.log(self.Z_hat)) - return penalty_term + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) - - def log_likelihood(self): - """ - The log marginal likelihood for an EP model can be written as the log likelihood of - a regression model for a new variable Y* = v_tilde/tau_tilde, with a covariance - matrix K* = K + diag(1./tau_tilde) plus a normalization term. - """ - complexity_term = -0.5*self.D*self.Kplus_logdet - return complexity_term + self._model_fit_term() + self._normalization_term() - - def dL_dK(self): - if self.YYT is None: - alpha = np.dot(self.Kplusi,self.Y) - dL_dK = 0.5*(np.dot(alpha,alpha.T)-self.D*self.Kplusi) - else: - dL_dK = 0.5*(mdot(self.Kplusi, self.YYT, self.Kplusi) - self.D*self.Kplusi) - return dL_dK - - def _log_likelihood_gradients(self): - return self.kern.dK_dtheta(partial=self.dL_dK(),X=self.X) - - def predict(self,Xnew, slices=None, full_cov=False): - """ - - Predict the function(s) at the new point(s) Xnew. - - Arguments - --------- - :param Xnew: The points at which to make a prediction - :type Xnew: np.ndarray, Nnew x self.Q - :param slices: specifies which outputs kernel(s) the Xnew correspond to (see below) - :type slices: (None, list of slice objects, list of ints) - :param full_cov: whether to return the folll covariance matrix, or just the diagonal - :type full_cov: bool - :rtype: posterior mean, a Numpy array, Nnew x self.D - :rtype: posterior variance, a Numpy array, Nnew x Nnew x (self.D) - - .. Note:: "slices" specifies how the the points X_new co-vary wich the training points. - - - If None, the new points covary throigh every kernel part (default) - - If a list of slices, the i^th slice specifies which data are affected by the i^th kernel part - - If a list of booleans, specifying which kernel parts are active - - If full_cov and self.D > 1, the return shape of var is Nnew x Nnew x self.D. If self.D == 1, the return shape is Nnew x Nnew. - This is to allow for different normalisations of the output dimensions. 
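The gradient dL_dK above is the standard Gaussian-regression expression for L(K) = -D/2 log|K*| - 1/2 tr(K*^{-1} Y Y^T) with K* = K + diag(1/tau_tilde). A quick finite-difference check of it, in the spirit of the checkgrad calls used elsewhere in this series (toy sizes, all names illustrative):

    import numpy as np

    rng = np.random.RandomState(1)
    N, D = 5, 1
    A = rng.randn(N, N)
    K = A.dot(A.T) + N*np.eye(N)            # a psd kernel matrix
    tau_tilde = rng.rand(N) + .5
    Y = rng.randn(N, D)

    def L(K):
        Kp = K + np.diag(1./tau_tilde)      # K* = K + diag(1/tau~)
        _, logdet = np.linalg.slogdet(Kp)
        return -0.5*D*logdet - 0.5*np.trace(np.linalg.solve(Kp, Y.dot(Y.T)))

    Kp = K + np.diag(1./tau_tilde)
    alpha = np.linalg.solve(Kp, Y)
    dL_dK = 0.5*(alpha.dot(alpha.T) - D*np.linalg.inv(Kp))   # as in dL_dK above

    E = np.zeros_like(K)
    E[0, 1] = E[1, 0] = 1e-6                # symmetric perturbation
    num = (L(K + E) - L(K - E))/(4e-6)      # central difference ~ dL_dK[0, 1]
    assert np.isclose(num, dL_dK[0, 1], atol=1e-5)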
- - - """ - - #normalise X values - Xnew = (Xnew.copy() - self._Xmean) / self._Xstd - mu, var, phi = self._raw_predict(Xnew, slices, full_cov) - - #un-normalise - mu = mu*self._Ystd + self._Ymean - if full_cov: - if self.D==1: - var *= np.square(self._Ystd) - else: - var = var[:,:,None] * np.square(self._Ystd) - else: - if self.D==1: - var *= np.square(np.squeeze(self._Ystd)) - else: - var = var[:,None] * np.square(self._Ystd) - - return mu,var,phi - - def _raw_predict(self,_Xnew,slices, full_cov=False): - """Internal helper function for making predictions, does not account for normalisation""" - K_x = self.kern.K(self.X,_Xnew,slices1=self.Xslices,slices2=slices) - aux2 = mdot(self.Bi,self.Sroot_tilde_K,self.v_tilde) - zeta = np.sqrt(self.tau_tilde)*aux2 - f = np.dot(K_x.T,self.v_tilde-zeta) - v = mdot(self.Li,np.sqrt(self.tau_tilde)*K_x) - if full_cov: - Kxx = self.kern.K(_Xnew,slices1=slices,slices2=slices) - var = Kxx - np.dot(v.T,v) - var_diag = np.diag(var)[:,None] - else: - Kxx = self.kern.Kdiag(_Xnew, slices=slices) - var_diag = (Kxx - np.sum(v**2,-2))[:,None] - phi = self.likelihood.predictive_mean(f,var_diag) - return f, var_diag, phi - - def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None): - """ - :param samples: the number of a posteriori samples to plot - :param which_data: which if the training data to plot (default all) - :type which_data: 'all' or a slice object to slice self.X, self.Y - :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits - :param which_functions: which of the kernel functions to plot (additively) - :type which_functions: list of bools - :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D - - Plot the posterior of the GP. - - In one dimension, the function is plotted with a shaded region identifying two standard deviations. - - In two dimsensions, a contour-plot shows the mean predicted function - - In higher dimensions, we've no implemented this yet !TODO! 
- - Can plot only part of the data and part of the posterior functions using which_data and which_functions - """ - if which_functions=='all': - which_functions = [True]*self.kern.Nparts - if which_data=='all': - which_data = slice(None) - - X = self.X[which_data,:] - Y = self.Y[which_data,:] - - Xorig = X*self._Xstd + self._Xmean - Yorig = Y*self._Ystd + self._Ymean - if plot_limits is None: - xmin,xmax = Xorig.min(0),Xorig.max(0) - xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin) - elif len(plot_limits)==2: - xmin, xmax = plot_limits - else: - raise ValueError, "Bad limits for plotting" - - if self.X.shape[1]==1: - Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None] - #m,v,phi = self.predict(Xnew,slices=which_functions) - #gpplot(Xnew,m,v) - mu_f, var_f, phi_f = self.predict(Xnew,slices=which_functions) - pb.subplot(211) - self.likelihood.plot1Da(X=Xnew,mean=mu_f,var=var_f,Z=self.X,mean_Z=self.mu,var_Z=np.diag(self.Sigma)) - if samples: - s = np.random.multivariate_normal(m.flatten(),v,samples) - pb.plot(Xnew.flatten(),s.T, alpha = 0.4, c='#3465a4', linewidth = 0.8) - pb.xlim(xmin,xmax) - pb.subplot(212) - self.likelihood.plot1Db(self.X,Xnew,phi_f) - - elif self.X.shape[1]==2: - resolution = 50 or resolution - xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution] - Xtest = np.vstack((xx.flatten(),yy.flatten())).T - zz,vv = self.predict(Xtest,slices=which_functions) - zz = zz.reshape(resolution,resolution) - pb.contour(xx,yy,zz,vmin=zz.min(),vmax=zz.max(),cmap=pb.cm.jet) - pb.scatter(Xorig[:,0],Xorig[:,1],40,Yorig,linewidth=0,cmap=pb.cm.jet,vmin=zz.min(),vmax=zz.max()) - pb.xlim(xmin[0],xmax[0]) - pb.ylim(xmin[1],xmax[1]) - - else: - raise NotImplementedError, "Cannot plot GPs with more than two input dimensions" diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py index ca44aab1..cc2f62d6 100644 --- a/GPy/models/__init__.py +++ b/GPy/models/__init__.py @@ -6,8 +6,6 @@ from GP_regression import GP_regression from sparse_GP_regression import sparse_GP_regression from GPLVM import GPLVM from warped_GP import warpedGP -from GP_EP import GP_EP -from GP_EP2 import GP_EP2 from generalized_FITC import generalized_FITC from sparse_GPLVM import sparse_GPLVM from uncollapsed_sparse_GP import uncollapsed_sparse_GP diff --git a/GPy/models/generalized_FITC.py b/GPy/models/generalized_FITC.py index a5ed8d0a..57ae2407 100644 --- a/GPy/models/generalized_FITC.py +++ b/GPy/models/generalized_FITC.py @@ -9,7 +9,8 @@ from .. 
import kern from ..core import model from ..util.linalg import pdinv,mdot from ..util.plot import gpplot -from ..inference.Expectation_Propagation import FITC +#from ..inference.Expectation_Propagation import FITC +from ..inference.EP import FITC from ..inference.likelihoods import likelihood,probit class generalized_FITC(model): From fad0e07624971b0e381db34806b1b27ae7d27fcb Mon Sep 17 00:00:00 2001 From: Ricardo Date: Mon, 28 Jan 2013 00:16:23 +0000 Subject: [PATCH 04/44] Sparse EP --- GPy/examples/ep_fix.py | 5 +- GPy/examples/poisson.py | 50 +++++++ GPy/examples/sparse_ep_fix.py | 76 ++++++++++ GPy/inference/EP.py | 9 +- GPy/models/GP.py | 8 +- GPy/models/__init__.py | 1 + GPy/models/sparse_GP.py | 258 ++++++++++++++++++++++++++++++++++ 7 files changed, 399 insertions(+), 8 deletions(-) create mode 100644 GPy/examples/poisson.py create mode 100644 GPy/examples/sparse_ep_fix.py create mode 100644 GPy/models/sparse_GP.py diff --git a/GPy/examples/ep_fix.py b/GPy/examples/ep_fix.py index 2da94335..9b35b3ff 100644 --- a/GPy/examples/ep_fix.py +++ b/GPy/examples/ep_fix.py @@ -10,6 +10,7 @@ import numpy as np import GPy pb.ion() +pb.close('all') default_seed=10000 model_type='Full' @@ -26,11 +27,13 @@ data = GPy.util.datasets.toy_linear_1d_classification(seed=seed) likelihood = GPy.inference.likelihoods.probit(data['Y'][:, 0:1]) m = GPy.models.GP(data['X'],likelihood=likelihood) +#m = GPy.models.GP(data['X'],Y=likelihood.Y) m.constrain_positive('var') m.constrain_positive('len') m.tie_param('lengthscale') -m.approximate_likelihood() +if not isinstance(m.likelihood,GPy.inference.likelihoods.gaussian): + m.approximate_likelihood() print m.checkgrad() # Optimize and plot m.optimize() diff --git a/GPy/examples/poisson.py b/GPy/examples/poisson.py new file mode 100644 index 00000000..5a1cc6af --- /dev/null +++ b/GPy/examples/poisson.py @@ -0,0 +1,50 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +""" +Simple Gaussian Processes classification +""" +import pylab as pb +import numpy as np +import GPy +pb.ion() + +pb.close('all') +default_seed=10000 + +model_type='Full' +inducing=4 +seed=default_seed +"""Simple 1D classification example. +:param model_type: type of model to fit ['Full', 'FITC', 'DTC']. +:param seed : seed value for data generation (default is 4). +:type seed: int +:param inducing : number of inducing variables (only used for 'FITC' or 'DTC'). +:type inducing: int +""" + +X = np.arange(0,100,5)[:,None] +F = np.round(np.sin(X/18.) + .1*X) +E = np.random.randint(-3,3,20)[:,None] +Y = F + E +pb.plot(X,F,'k-') +pb.plot(X,Y,'ro') +pb.figure() +likelihood = GPy.inference.likelihoods.poisson(Y,scale=4.) + +m = GPy.models.GP(X,likelihood=likelihood) +#m = GPy.models.GP(data['X'],Y=likelihood.Y) + +m.constrain_positive('var') +m.constrain_positive('len') +m.tie_param('lengthscale') +if not isinstance(m.likelihood,GPy.inference.likelihoods.gaussian): + m.approximate_likelihood() +print m.checkgrad() +# Optimize and plot +m.optimize() +#m.em(plot_all=False) # EM algorithm +m.plot() + +print(m) diff --git a/GPy/examples/sparse_ep_fix.py b/GPy/examples/sparse_ep_fix.py new file mode 100644 index 00000000..738a82e6 --- /dev/null +++ b/GPy/examples/sparse_ep_fix.py @@ -0,0 +1,76 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
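The Poisson example above relies on poisson.moments_match, whose body is not shown in this series. Unlike the probit case there is no closed form for the tilted moments, and a common route is Gauss-Hermite quadrature. A sketch under that assumption, guessing the rate as scale*exp(f) from the scale argument; the likelihood's actual link function may differ:

    import numpy as np
    from scipy import stats

    def poisson_moments_gh(y_i, tau_cav, v_cav, scale=1., order=32):
        # Tilted moments of Poisson(y | scale*exp(f)) * N(f | cavity),
        # approximated by Gauss-Hermite quadrature. Purely illustrative.
        mu, s2 = v_cav/tau_cav, 1./tau_cav
        x, w = np.polynomial.hermite.hermgauss(order)
        f = mu + np.sqrt(2.*s2)*x                    # quadrature nodes
        p = stats.poisson.pmf(y_i, scale*np.exp(f))  # likelihood at nodes
        Z = np.dot(w, p)/np.sqrt(np.pi)              # zeroth moment
        m1 = np.dot(w, f*p)/np.sqrt(np.pi)/Z         # first moment
        m2 = np.dot(w, f**2*p)/np.sqrt(np.pi)/Z      # second moment
        return Z, m1, m2 - m1**2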
+# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import numpy as np +""" +Sparse Gaussian Processes regression with an RBF kernel +""" +import pylab as pb +import numpy as np +import GPy +np.random.seed(2) +pb.ion() +N = 500 +M = 5 + +###################################### +## 1 dimensional example + +# sample inputs and outputs +X = np.random.uniform(-3.,3.,(N,1)) +#Y = np.sin(X)+np.random.randn(N,1)*0.05 +F = np.sin(X)+np.random.randn(N,1)*0.05 +Y = np.ones([F.shape[0],1]) +Y[F<0] = -1 +likelihood = GPy.inference.likelihoods.probit(Y) + +# construct kernel +rbf = GPy.kern.rbf(1) +noise = GPy.kern.white(1) +kernel = rbf + noise + +# create simple GP model +#m1 = GPy.models.sparse_GP_regression(X, Y, kernel, M=M) +m1 = GPy.models.sparse_GP(X, kernel, M=M,likelihood= likelihood) + +# contrain all parameters to be positive +m1.constrain_positive('(variance|lengthscale|precision)') +#m1.constrain_positive('(variance|lengthscale)') +#m1.constrain_fixed('prec',10.) + + +#check gradient FIXME unit test please +m1.checkgrad() +# optimize and plot +m1.optimize('tnc', messages = 1) +m1.plot() +# print(m1) + +###################################### +## 2 dimensional example + +# # sample inputs and outputs +# X = np.random.uniform(-3.,3.,(N,2)) +# Y = np.sin(X[:,0:1]) * np.sin(X[:,1:2])+np.random.randn(N,1)*0.05 + +# # construct kernel +# rbf = GPy.kern.rbf(2) +# noise = GPy.kern.white(2) +# kernel = rbf + noise + +# # create simple GP model +# m2 = GPy.models.sparse_GP_regression(X,Y,kernel, M = 50) +# create simple GP model + +# # contrain all parameters to be positive (but not inducing inputs) +# m2.constrain_positive('(variance|lengthscale|precision)') + +# #check gradient FIXME unit test please +# m2.checkgrad() + +# # optimize and plot +# pb.figure() +# m2.optimize('tnc', messages = 1) +# m2.plot() +# print(m2) diff --git a/GPy/inference/EP.py b/GPy/inference/EP.py index f7c163b1..751d5ca8 100644 --- a/GPy/inference/EP.py +++ b/GPy/inference/EP.py @@ -9,7 +9,7 @@ from ..util.plot import gpplot from .. import kern class EP: - def __init__(self,covariance,likelihood,Kmn=None,Knn_diag=None,epsilon=1e-3,powerep=[1.,1.]): + def __init__(self,covariance,likelihood,Kmn=None,Knn_diag=None,epsilon=1e-3,power_ep=[1.,1.]): """ Expectation Propagation @@ -19,7 +19,7 @@ class EP: likelihood : Output's likelihood (likelihood class) kernel : a GPy kernel (kern class) inducing : Either an array specifying the inducing points location or a sacalar defining their number. None value for using a non-sparse model is used. 
- powerep : Power-EP parameters (eta,delta) - 2x1 numpy array (floats) + power_ep : Power-EP parameters (eta,delta) - 2x1 numpy array (floats) epsilon : Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float) """ self.likelihood = likelihood @@ -38,7 +38,7 @@ class EP: assert len(Knn_diag) == self.N, 'Knn_diagonal has size different from N' self.epsilon = epsilon - self.eta, self.delta = powerep + self.eta, self.delta = power_ep self.jitter = 1e-12 """ @@ -110,6 +110,7 @@ class Full(EP): self.Sigma = self.Sigma - Delta_tau/(1.+ Delta_tau*self.Sigma[i,i])*np.dot(si,si.T) self.mu = np.dot(self.Sigma,self.v_tilde) self.iterations += 1 + print self.tau_tilde[i] #TODO erase me #Sigma recomptutation with Cholesky decompositon Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*(self.K) B = np.eye(self.N) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K @@ -206,6 +207,7 @@ class DTC(EP): epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N self.np1.append(self.tau_tilde.copy()) self.np2.append(self.v_tilde.copy()) + return self.tau_tilde[:,None], self.v_tilde[:,None], self.Z_hat[:,None], self.tau_[:,None], self.v_[:,None] class FITC(EP): def fit_EP(self): @@ -306,3 +308,4 @@ class FITC(EP): epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N self.np1.append(self.tau_tilde.copy()) self.np2.append(self.v_tilde.copy()) + return self.tau_tilde[:,None], self.v_tilde[:,None], self.Z_hat[:,None], self.tau_[:,None], self.v_[:,None] diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 4a8d23e9..ccfe95c7 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -29,8 +29,8 @@ class GP(model): """ - def __init__(self,X,Y=None,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None,likelihood=None,epsilon_ep=1e-3,epsion_em=.1,powerep=[1.,1.]): - #TODO: specify beta parameter explicitely + def __init__(self,X,Y=None,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None,likelihood=None,epsilon_ep=1e-3,epsion_em=.1,power_ep=[1.,1.]): + #TODO: make beta parameter explicit # parse arguments self.Xslices = Xslices @@ -87,7 +87,7 @@ class GP(model): else: # Y is defined after approximating the likelihood self.EP = True - self.eta,self.delta = powerep + self.eta,self.delta = power_ep self.epsilon_ep = epsilon_ep self.tau_tilde = np.ones([self.N,self.D]) self.v_tilde = np.zeros([self.N,self.D]) @@ -116,7 +116,7 @@ class GP(model): def approximate_likelihood(self): assert not isinstance(self.likelihood, gaussian), "EP is only available for non-gaussian likelihoods" - self.ep_approx = Full(self.K,self.likelihood,epsilon=self.epsilon_ep,powerep=[self.eta,self.delta]) + self.ep_approx = Full(self.K,self.likelihood,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta]) self.tau_tilde, self.v_tilde, self.Z_hat, self.tau_, self.v_=self.ep_approx.fit_EP() # Y: EP likelihood is defined as a regression model for mu_tilde self.Y = self.v_tilde/self.tau_tilde diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py index cc2f62d6..a839f827 100644 --- a/GPy/models/__init__.py +++ b/GPy/models/__init__.py @@ -10,3 +10,4 @@ from generalized_FITC import generalized_FITC from sparse_GPLVM import sparse_GPLVM from uncollapsed_sparse_GP import uncollapsed_sparse_GP from GP import GP +from sparse_GP import sparse_GP diff --git a/GPy/models/sparse_GP.py b/GPy/models/sparse_GP.py new file mode 100644 index 00000000..1164a1af --- /dev/null +++ b/GPy/models/sparse_GP.py @@ -0,0 +1,258 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
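The new return block in Full.fit_EP recasts EP as Gaussian regression on the pseudo-targets mu_tilde, with Z_ep collecting everything the regression likelihood of mu_tilde does not account for:

    \tilde{\mu}_i=\tilde{\nu}_i/\tilde{\tau}_i, \qquad
    \log Z_{ep} \;=\; \sum_i \log\hat{Z}_i
      \;+\; \tfrac{1}{2}\sum_i \log\big(\sigma_{-i}^2+\tilde{\sigma}_i^2\big)
      \;+\; \tfrac{1}{2}\sum_i \frac{(\mu_{-i}-\tilde{\mu}_i)^2}{\sigma_{-i}^2+\tilde{\sigma}_i^2}

where sigma_{-i}^2 = 1/tau_{-i} and tilde{sigma}_i^2 = 1/tilde{tau}_i, i.e. exactly the sigma_sum and mu_diff_2 arrays in the code.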
+# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import numpy as np +import pylab as pb +from ..util.linalg import mdot, jitchol, chol_inv, pdinv +from ..util.plot import gpplot +from .. import kern +from GP import GP +from ..inference.EP import Full +from ..inference.likelihoods import likelihood,probit,poisson,gaussian + +#Still TODO: +# make use of slices properly (kernel can now do this) +# enable heteroscedatic noise (kernel will need to compute psi2 as a (NxMxM) array) + +class sparse_GP(GP): + """ + Variational sparse GP model (Regression) + + :param X: inputs + :type X: np.ndarray (N x Q) + :param Y: observed data + :type Y: np.ndarray of observations (N x D) + :param kernel : the kernel/covariance function. See link kernels + :type kernel: a GPy kernel + :param Z: inducing inputs (optional, see note) + :type Z: np.ndarray (M x Q) | None + :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance) + :type X_uncertainty: np.ndarray (N x Q) | None + :param Zslices: slices for the inducing inputs (see slicing TODO: link) + :param M : Number of inducing points (optional, default 10. Ignored if Z is not None) + :type M: int + :param beta: noise precision. TODO> ignore beta if doing EP + :type beta: float + :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales) + :type normalize_(X|Y): bool + """ + + def __init__(self,X,Y,kernel=None, X_uncertainty=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False,likelihood=None,method_ep='DTC',epsilon_ep=1e-3,epsilon_em=.1,power_ep=[1.,1.]): + + if Z is None: + self.Z = np.random.permutation(X.copy())[:M] + self.M = M + else: + assert Z.shape[1]==X.shape[1] + self.Z = Z + self.M = Z.shape[0] + if X_uncertainty is None: + self.has_uncertain_inputs=False + else: + assert X_uncertainty.shape==X.shape + self.has_uncertain_inputs=True + self.X_uncertainty = X_uncertainty + + + self.beta = beta #FIXME + GP.__init__(self, X, Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y,likelihood=likelihood,epsilon_ep=epsilon_ep,epsion_em=epsilon_em,power_ep=power_ep) + self.beta = beta if isinstance(likelihood,gaussian) else self.tau_tilde #TODO this should be defined in GP.__init__ + + + #normalise X uncertainty also + if self.has_uncertain_inputs: + self.X_uncertainty /= np.square(self._Xstd) + + def _set_params(self, p): + if not self.EP: + self.Z = p[:self.M*self.Q].reshape(self.M, self.Q) + self.beta = p[self.M*self.Q] + self.kern._set_params(p[self.Z.size + 1:]) + self.beta2 = self.beta**2 + self._compute_kernel_matrices() + self._computations() + else: + self.Z = p[:self.M*self.Q].reshape(self.M, self.Q) + self.kern._set_params(p[self.Z.size:]) + #self._compute_kernel_matrices() this is replaced by _ep_covariance + self._ep_covariance() + self._ep_computations() + + def approximate_likelihood(self): + assert not isinstance(self.likelihood, gaussian), "EP is only available for non-gaussian likelihoods" + if self.ep_proxy == 'DTC': + self.ep_approx = DTC(self.Kmm,self.likelihood,self.psi1,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta]) + elif self.ep_proxy == 'FITC': + self.Knn_diag = self.kern.psi0(self.Z,self.X, self.X_uncertainty) #TODO psi0 already calculates this + self.ep_approx = FITC(self.Kmm,self.likelihood,self.psi1,self.Knn_diag,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta]) + else: + self.ep_approx = 
Full(self.X,self.likelihood,self.kernel,inducing=None,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta]) + self.beta, self.v_tilde, self.Z_hat, self.tau_, self.v_=self.ep_approx.fit_EP() + # Y: EP likelihood is defined as a regression model for mu_tilde + self.Y = self.v_tilde/self.beta + self._Ymean = np.zeros((1,self.Y.shape[1])) + self._Ystd = np.ones((1,self.Y.shape[1])) + self.trbetaYYT = np.sum(self.beta*np.square(self.Y)) + if self.D > self.N: + # then it's more efficient to store YYT + self.YYT = np.dot(self.Y, self.Y.T) + else: + self.YYT = None + self.mu_ = self.v_/self.tau_ + self._ep_covariance() + self._computations() + + def _ep_covariance(self): + self.Kmm = self.kern.K(self.Z) + if self.has_uncertain_inputs: + self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum() + self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T + self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty) #FIXME include beta + else: + #self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum() + self.Knn_diag = self.kern.Kdiag(self.X,slices=self.Xslices) + self.psi0 = (self.beta*self.Knn_diag).sum() #TODO check dimensions + self.psi1 = self.kern.K(self.Z,self.X) + #self.psi2 = np.dot(self.psi1,self.psi1.T) + self.psi2 = np.dot(self.psi1,self.beta*self.psi1.T) + + def _compute_kernel_matrices(self): + # kernel computations, using BGPLVM notation + #TODO: slices for psi statistics (easy enough) + + self.Kmm = self.kern.K(self.Z) + if self.has_uncertain_inputs: + self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum() + self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T + self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty) + else: + self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum() + self.psi1 = self.kern.K(self.Z,self.X) + self.psi2 = np.dot(self.psi1,self.psi1.T) + + def _ep_computations(self): + # TODO find routine to multiply triangular matrices + self.V = self.beta*self.Y + self.psi1V = np.dot(self.psi1, self.V) + self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T) + self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm) + #self.A = mdot(self.Lmi, self.beta*self.psi2, self.Lmi.T) + self.A = mdot(self.Lmi, self.psi2, self.Lmi.T) + self.B = np.eye(self.M) + self.A + self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B) + self.LLambdai = np.dot(self.LBi, self.Lmi) + #self.trace_K = self.psi0 - np.sum(np.dot(self.Lmi,self.psi1)**2,-1) #TODO check + self.trace_K = self.psi0 - np.trace(self.A) + self.LBL_inv = mdot(self.Lmi.T, self.Bi, self.Lmi) + self.C = mdot(self.LLambdai, self.psi1V) + self.G = mdot(self.LBL_inv, self.psi1VVpsi1, self.LBL_inv.T) + + # Compute dL_dpsi + #self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N) + self.dL_dpsi0 = - 0.5 * self.D * self.beta.flatten() * np.ones(self.N) #TODO check + self.dL_dpsi1 = mdot(self.LLambdai.T,self.C,self.V.T) + #self.dL_dpsi2 = - 0.5 * self.beta * (self.D*(self.LBL_inv - self.Kmmi) + self.G) + self.dL_dpsi2 = - 0.5 * self.beta * (self.D*(self.LBL_inv - self.Kmmi) + self.G) + + # Compute dL_dKmm + self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi) # dB + self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - 2.*self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi) + self.Kmmi) # dC + self.dL_dKmm += np.dot(np.dot(self.G,self.beta*self.psi2) - np.dot(self.LBL_inv, self.psi1VVpsi1), self.Kmmi) + 0.5*self.G # dE + + def _get_params(self): + if not self.EP: + return np.hstack([self.Z.flatten(),self.beta,self.kern._get_params_transformed()]) + else: 
+ return np.hstack([self.Z.flatten(),self.kern._get_params_transformed()]) + + def _get_param_names(self): + if not self.EP: + return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + ['noise_precision']+self.kern._get_param_names_transformed() + else: + return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + self.kern._get_param_names_transformed() + + def log_likelihood(self): + """ + Compute the (lower bound on the) log marginal likelihood + """ + beta_logdet = self.N*self.D*np.log(self.beta) if not self.EP else self.D*np.sum(np.log(self.beta)) + A = -0.5*self.N*self.D*(np.log(2.*np.pi)) - 0.5*beta_logdet + B = -0.5*self.beta*self.D*self.trace_K if not self.EP else -0.5*self.D*self.trace_K + C = -0.5*self.D * self.B_logdet + D = -0.5*self.beta*self.trYYT if not self.EP else -0.5*self.trbetaYYT + E = +0.5*np.sum(self.psi1VVpsi1 * self.LBL_inv) + return A+B+C+D+E + + def dL_dbeta(self): + """ + Compute the gradient of the log likelihood wrt beta. + """ + #TODO: suport heteroscedatic noise + dA_dbeta = 0.5 * self.N*self.D/self.beta + dB_dbeta = - 0.5 * self.D * self.trace_K + dC_dbeta = - 0.5 * self.D * np.sum(self.Bi*self.A)/self.beta + dD_dbeta = - 0.5 * self.trYYT + tmp = mdot(self.LBi.T, self.LLambdai, self.psi1V) + dE_dbeta = (np.sum(np.square(self.C)) - 0.5 * np.sum(self.A * np.dot(tmp, tmp.T)))/self.beta + + return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta + dE_dbeta) + + def dL_dtheta(self): + """ + Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel + """ + dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z) + if self.has_uncertain_inputs: + dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty) + dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty) + dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # for multiple_beta, dL_dpsi2 will be a different shape + else: + #re-cast computations in psi2 back to psi1: + dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1) + dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X) + dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X) + + return dL_dtheta + + def dL_dZ(self): + """ + The derivative of the bound wrt the inducing inputs Z + """ + dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z,)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ + if self.has_uncertain_inputs: + dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty) + dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) + else: + #re-cast computations in psi2 back to psi1: + dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1) + dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X) + return dL_dZ + + def _log_likelihood_gradients(self): + return np.hstack([self.dL_dZ().flatten(), self.dL_dbeta(), self.dL_dtheta()]) + + def _raw_predict(self, Xnew, slices, full_cov=False): + """Internal helper function for making predictions, does not account for normalisation""" + Kx = self.kern.K(self.Z, Xnew) + mu = mdot(Kx.T, self.LBL_inv, self.psi1V) + if full_cov: + noise_term = np.eye(Xnew.shape[0])/self.beta if not self.EP else 0 + Kxx = self.kern.K(Xnew) + var = Kxx - mdot(Kx.T, (self.Kmmi - self.LBL_inv), Kx) + noise_term + else: + noise_term = 1./self.beta if not self.EP else 0 + Kxx = self.kern.Kdiag(Xnew) + var = Kxx - np.sum(Kx*np.dot(self.Kmmi - 
self.LBL_inv, Kx),0) + noise_term + return mu,var + + def plot(self, *args, **kwargs): + """ + Plot the fitted model: just call the GP_regression plot function and then add inducing inputs + """ + GP_regression.plot(self,*args,**kwargs) + if self.Q==1: + pb.plot(self.Z,self.Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12) + if self.has_uncertain_inputs: + pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten())) + if self.Q==2: + pb.plot(self.Z[:,0],self.Z[:,1],'wo') From 29ec128c9d6620b20989c9bdb27de95c098927ef Mon Sep 17 00:00:00 2001 From: Ricardo Andrade Date: Mon, 28 Jan 2013 17:47:08 +0000 Subject: [PATCH 05/44] Other changes. --- GPy/examples/ep_fix.py | 12 ++-- GPy/examples/poisson.py | 2 +- GPy/examples/sparse_ep_fix.py | 34 +-------- GPy/inference/EP.py | 9 ++- GPy/inference/likelihoods.py | 32 ++++++++- GPy/models/GP.py | 92 +++++++++++-------------- GPy/models/sparse_GP.py | 126 +++++++++++++++++++++------------- 7 files changed, 164 insertions(+), 143 deletions(-) diff --git a/GPy/examples/ep_fix.py b/GPy/examples/ep_fix.py index 9b35b3ff..c4e025dd 100644 --- a/GPy/examples/ep_fix.py +++ b/GPy/examples/ep_fix.py @@ -11,11 +11,9 @@ import GPy pb.ion() pb.close('all') -default_seed=10000 model_type='Full' inducing=4 -seed=default_seed """Simple 1D classification example. :param model_type: type of model to fit ['Full', 'FITC', 'DTC']. :param seed : seed value for data generation (default is 4). @@ -23,21 +21,19 @@ seed=default_seed :param inducing : number of inducing variables (only used for 'FITC' or 'DTC'). :type inducing: int """ -data = GPy.util.datasets.toy_linear_1d_classification(seed=seed) +data = GPy.util.datasets.toy_linear_1d_classification(seed=0) likelihood = GPy.inference.likelihoods.probit(data['Y'][:, 0:1]) m = GPy.models.GP(data['X'],likelihood=likelihood) -#m = GPy.models.GP(data['X'],Y=likelihood.Y) +#m = GPy.models.GP(data['X'],likelihood.Y) -m.constrain_positive('var') -m.constrain_positive('len') -m.tie_param('lengthscale') +m.ensure_default_constraints() if not isinstance(m.likelihood,GPy.inference.likelihoods.gaussian): m.approximate_likelihood() print m.checkgrad() # Optimize and plot m.optimize() #m.em(plot_all=False) # EM algorithm -m.plot() +m.plot(samples=3) print(m) diff --git a/GPy/examples/poisson.py b/GPy/examples/poisson.py index 5a1cc6af..71d80b30 100644 --- a/GPy/examples/poisson.py +++ b/GPy/examples/poisson.py @@ -31,7 +31,7 @@ Y = F + E pb.plot(X,F,'k-') pb.plot(X,Y,'ro') pb.figure() -likelihood = GPy.inference.likelihoods.poisson(Y,scale=4.) +likelihood = GPy.inference.likelihoods.poisson(Y,scale=6.) m = GPy.models.GP(X,likelihood=likelihood) #m = GPy.models.GP(data['X'],Y=likelihood.Y) diff --git a/GPy/examples/sparse_ep_fix.py b/GPy/examples/sparse_ep_fix.py index 738a82e6..7e3f1fc3 100644 --- a/GPy/examples/sparse_ep_fix.py +++ b/GPy/examples/sparse_ep_fix.py @@ -31,46 +31,18 @@ noise = GPy.kern.white(1) kernel = rbf + noise # create simple GP model -#m1 = GPy.models.sparse_GP_regression(X, Y, kernel, M=M) -m1 = GPy.models.sparse_GP(X, kernel, M=M,likelihood= likelihood) +#m1 = GPy.models.sparse_GP(X, Y, kernel, M=M) +m1 = GPy.models.sparse_GP(X,Y=None, kernel=kernel, M=M,likelihood= likelihood) +print m1.checkgrad() # contrain all parameters to be positive m1.constrain_positive('(variance|lengthscale|precision)') #m1.constrain_positive('(variance|lengthscale)') #m1.constrain_fixed('prec',10.) 
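For the Gaussian (non-EP) case the Cholesky plumbing in _raw_predict above collapses, since LBL_inv = Lm^{-T} B^{-1} Lm^{-1} = (Kmm + beta Kmn Knm)^{-1}. Written out densely for a toy problem (illustrative RBF kernel and sizes, homoscedastic beta), a sketch of the same predictive equations:

    import numpy as np

    def rbf(A, B, ell=1.0):
        return np.exp(-0.5*((A[:, None, :] - B[None, :, :])**2).sum(-1)/ell**2)

    rng = np.random.RandomState(3)
    X, Z = rng.randn(20, 1), rng.randn(4, 1)      # data and inducing inputs
    Xs = np.linspace(-2, 2, 5)[:, None]           # test points
    Y, beta = np.sin(X), 100.                     # targets, noise precision
    Kmm = rbf(Z, Z) + 1e-8*np.eye(4)
    psi1 = rbf(Z, X)                              # Kmn
    Sinv = Kmm + beta*psi1.dot(psi1.T)            # inverse of LBL_inv above
    psi1V = psi1.dot(beta*Y)
    Kx = rbf(Z, Xs)
    mu = Kx.T.dot(np.linalg.solve(Sinv, psi1V))   # predictive mean
    Kmmi_minus = np.linalg.inv(Kmm) - np.linalg.inv(Sinv)
    var = rbf(Xs, Xs).diagonal() - np.sum(Kx*Kmmi_minus.dot(Kx), 0) + 1./beta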
- #check gradient FIXME unit test please -m1.checkgrad() # optimize and plot m1.optimize('tnc', messages = 1) m1.plot() # print(m1) -###################################### -## 2 dimensional example - -# # sample inputs and outputs -# X = np.random.uniform(-3.,3.,(N,2)) -# Y = np.sin(X[:,0:1]) * np.sin(X[:,1:2])+np.random.randn(N,1)*0.05 - -# # construct kernel -# rbf = GPy.kern.rbf(2) -# noise = GPy.kern.white(2) -# kernel = rbf + noise - -# # create simple GP model -# m2 = GPy.models.sparse_GP_regression(X,Y,kernel, M = 50) -# create simple GP model - -# # contrain all parameters to be positive (but not inducing inputs) -# m2.constrain_positive('(variance|lengthscale|precision)') - -# #check gradient FIXME unit test please -# m2.checkgrad() - -# # optimize and plot -# pb.figure() -# m2.optimize('tnc', messages = 1) -# m2.plot() -# print(m2) diff --git a/GPy/inference/EP.py b/GPy/inference/EP.py index 751d5ca8..5d571888 100644 --- a/GPy/inference/EP.py +++ b/GPy/inference/EP.py @@ -110,7 +110,6 @@ class Full(EP): self.Sigma = self.Sigma - Delta_tau/(1.+ Delta_tau*self.Sigma[i,i])*np.dot(si,si.T) self.mu = np.dot(self.Sigma,self.v_tilde) self.iterations += 1 - print self.tau_tilde[i] #TODO erase me #Sigma recomptutation with Cholesky decompositon Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*(self.K) B = np.eye(self.N) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K @@ -122,7 +121,13 @@ class Full(EP): epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N self.np1.append(self.tau_tilde.copy()) self.np2.append(self.v_tilde.copy()) - return self.tau_tilde[:,None], self.v_tilde[:,None], self.Z_hat[:,None], self.tau_[:,None], self.v_[:,None] + + #Variables to be called from GP + mu_tilde = self.v_tilde/self.tau_tilde #When calling EP, this variable is used instead of Y in the GP model + sigma_sum = 1./self.tau_ + 1./self.tau_tilde + mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2 + Z_ep = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant + return self.tau_tilde[:,None], mu_tilde[:,None], Z_ep class DTC(EP): def fit_EP(self): diff --git a/GPy/inference/likelihoods.py b/GPy/inference/likelihoods.py index 7f5d9140..864afa57 100644 --- a/GPy/inference/likelihoods.py +++ b/GPy/inference/likelihoods.py @@ -21,6 +21,27 @@ class likelihood: self.location = location self.scale = scale + def plot1D(self,X,mean,var,Z=None,mean_Z=None,var_Z=None,samples=0): + """ + Plot the predictive distribution of the GP model for 1-dimensional inputs + + :param X: The points at which to make a prediction + :param Mean: mean values at X + :param Var: variance values at X + :param Z: Set of points to be highlighted in the plot, i.e. 
inducing points + :param mean_Z: mean values at Z + :param var_Z: variance values at Z + :param samples: number of samples to plot + """ + assert X.shape[1] == 1, 'Number of dimensions must be 1' + gpplot(X,mean,var.flatten()) + if samples: #NOTE why don't we put samples as a parameter of gpplot + s = np.random.multivariate_normal(mean.flatten(),np.diag(var),samples) + pb.plot(X.flatten(),s.T, alpha = 0.4, c='#3465a4', linewidth = 0.8) + #pb.subplot(211) + #self.plot1Da(X,mean,var,Z,mean_Z,var_Z) + + def plot1Da(self,X,mean,var,Z=None,mean_Z=None,var_Z=None): """ Plot the predictive distribution of the GP model for 1-dimensional inputs @@ -37,6 +58,7 @@ pb.errorbar(Z.flatten(),mean_Z.flatten(),2*np.sqrt(var_Z.flatten()),fmt='r+') pb.plot(Z,mean_Z,'ro') + """ def plot1Db(self,X_obs,X,phi,Z=None): assert X_obs.shape[1] == 1, 'Number of dimensions must be 1' gpplot(X,phi,np.zeros(X.shape[0])) @@ -45,6 +67,7 @@ if Z is not None: pb.plot(Z,Z*0+.5,'r|',mew=1.5,markersize=12) + """ def plot2D(self,X,X_new,F_new,U=None): """ Predictive distribution of the fitted GP model for 2-dimensional inputs @@ -98,7 +121,6 @@ sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat) return Z_hat, mu_hat, sigma2_hat - def predictive_mean(self,mu,var): mu = mu.flatten() var = var.flatten() @@ -107,6 +129,14 @@ def _log_likelihood_gradients(): raise NotImplementedError + def plot(self,X,phi,X_obs,Z=None): + assert X_obs.shape[1] == 1, 'Number of dimensions must be 1' + gpplot(X,phi,np.zeros(X.shape[0])) + pb.plot(X_obs,(self.Y+1)/2,'kx',mew=1.5) + if Z is not None: + pb.plot(Z,Z*0+.5,'r|',mew=1.5,markersize=12) + pb.ylim(-0.2,1.2) + class poisson(likelihood): """ Poisson likelihood diff --git a/GPy/models/GP.py b/GPy/models/GP.py index ccfe95c7..3a9f6de8 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -24,13 +24,18 @@ class GP(model): :type normalize_Y: False|True :param Xslices: how the X,Y data co-vary in the kernel (i.e. which "outputs" they correspond to). See (link:slicing) :rtype: model object + :param likelihood: a GPy likelihood, defaults to gaussian + :param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 1e-3 + :param power_ep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.] + :type power_ep: list .. Note:: Multiple independent outputs are allowed using columns of Y """ + #TODO: make beta parameter explicit + #TODO: when using EP, predict needs to return 3 values otherwise it just needs 2. At the moment predict returns 3 values in any case.
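The probit predictive_mean (its body is truncated in the hunk above) has the standard closed form for the class probability under a Gaussian latent posterior (Rasmussen and Williams 2006, eq. 3.80), presumably what the method implements:

    import numpy as np
    from scipy import stats

    def probit_predictive_mean(mu, var):
        # p(y=1 | x*) when the latent posterior at x* is N(mu, var):
        # the Gaussian integral of the normal CDF has this closed form.
        return stats.norm.cdf(mu/np.sqrt(1. + var))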
- def __init__(self,X,Y=None,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None,likelihood=None,epsilon_ep=1e-3,epsion_em=.1,power_ep=[1.,1.]): - #TODO: make beta parameter explicit + def __init__(self,X,Y=None,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None,likelihood=None,epsilon_ep=1e-3,epsilon_em=.1,power_ep=[1.,1.]): # parse arguments self.Xslices = Xslices @@ -54,7 +59,6 @@ class GP(model): self._Xmean = np.zeros((1,self.X.shape[1])) self._Xstd = np.ones((1,self.X.shape[1])) - # Y - likelihood related variables, these might change whether using EP or not if likelihood is None: assert Y is not None, "Either Y or likelihood must be defined" @@ -68,8 +72,9 @@ class GP(model): if isinstance(self.likelihood,gaussian): self.EP = False self.Y = Y + self.beta = 100.#FIXME beta should be an explicit parameter for this model - #here's some simple normalisation + # Here's some simple normalisation if normalize_Y: self._Ymean = Y.mean(0)[None,:] self._Ystd = Y.std(0)[None,:] @@ -89,50 +94,43 @@ class GP(model): self.EP = True self.eta,self.delta = power_ep self.epsilon_ep = epsilon_ep - self.tau_tilde = np.ones([self.N,self.D]) - self.v_tilde = np.zeros([self.N,self.D]) - self.tau_ = np.ones([self.N,self.D]) - self.v_ = np.zeros([self.N,self.D]) - self.Z_hat = np.ones([self.N,self.D]) + self.beta = np.ones([self.N,self.D]) + self.Z_ep = 0 + self.Y = None + self._Ymean = np.zeros((1,self.D)) + self._Ystd = np.ones((1,self.D)) model.__init__(self) def _set_params(self,p): - # TODO: remove beta when using EP + # TODO: add beta when not using EP self.kern._set_params_transformed(p) - if not self.EP: - self.K = self.kern.K(self.X,slices1=self.Xslices) - self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K) - else: - self._ep_covariance() + self.K = self.kern.K(self.X,slices1=self.Xslices) + if self.EP: + self.K += np.diag(1./self.beta.flatten()) + #else: + # self.beta = p[-1] + self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K) def _get_params(self): - # TODO: remove beta when using EP + # TODO: add beta when not using EP return self.kern._get_params_transformed() def _get_param_names(self): - # TODO: remove beta when using EP + # TODO: add beta when not using EP return self.kern._get_param_names_transformed() def approximate_likelihood(self): assert not isinstance(self.likelihood, gaussian), "EP is only available for non-gaussian likelihoods" - self.ep_approx = Full(self.K,self.likelihood,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta]) - self.tau_tilde, self.v_tilde, self.Z_hat, self.tau_, self.v_=self.ep_approx.fit_EP() - # Y: EP likelihood is defined as a regression model for mu_tilde - self.Y = self.v_tilde/self.tau_tilde - self._Ymean = np.zeros((1,self.Y.shape[1])) - self._Ystd = np.ones((1,self.Y.shape[1])) + self.ep_approx = Full(self.K,self.likelihood,epsilon = self.epsilon_ep,power_ep=[self.eta,self.delta]) + self.beta, self.Y, self.Z_ep = self.ep_approx.fit_EP() if self.D > self.N: # then it's more efficient to store YYT self.YYT = np.dot(self.Y, self.Y.T) else: self.YYT = None - self.mu_ = self.v_/self.tau_ - self._ep_covariance() - - def _ep_covariance(self): # Kernel plus noise variance term - self.K = self.kern.K(self.X,slices1=self.Xslices) + np.diag(1./self.tau_tilde.flatten()) + self.K = self.kern.K(self.X,slices1=self.Xslices) + np.diag(1./self.beta.flatten()) self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K) def _model_fit_term(self): @@ -144,25 +142,16 @@ class GP(model): else: return -0.5*np.sum(np.multiply(self.Ki, self.YYT)) - def 
_normalization_term(self): - """ - Computes the marginal likelihood normalization constants - """ - sigma_sum = 1./self.tau_ + 1./self.tau_tilde - mu_diff_2 = (self.mu_ - self.Y)**2 - penalty_term = np.sum(np.log(self.Z_hat)) - return penalty_term + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) - def log_likelihood(self): """ The log marginal likelihood for an EP model can be written as the log likelihood of a regression model for a new variable Y* = v_tilde/tau_tilde, with a covariance matrix K* = K + diag(1./tau_tilde) plus a normalization term. """ - complexity_term = -0.5*self.D*self.Kplus_logdet - normalization_term = 0 if self.EP == False else self.normalization_term() - return complexity_term + normalization_term + self._model_fit_term() - + L = -0.5*self.D*self.K_logdet + self._model_fit_term() + if self.EP: + L += self.Z_ep #normalization constant returned by fit_EP + return L def log_likelihood(self): complexity_term = -0.5*self.N*self.D*np.log(2.*np.pi) - 0.5*self.D*self.K_logdet @@ -174,7 +163,6 @@ dL_dK = 0.5*(np.dot(alpha,alpha.T)-self.D*self.Ki) else: dL_dK = 0.5*(mdot(self.Ki, self.YYT, self.Ki) - self.D*self.Ki) - return dL_dK def _log_likelihood_gradients(self): @@ -267,7 +255,7 @@ Y = self.Y[which_data,:] Xorig = X*self._Xstd + self._Xmean - Yorig = Y*self._Ystd + self._Ymean if not self.EP else self.likelihood.Y + Yorig = Y*self._Ystd + self._Ymean #NOTE For EP this is v_tilde/beta if plot_limits is None: xmin,xmax = Xorig.min(0),Xorig.max(0) @@ -282,19 +270,17 @@ m,v,phi = self.predict(Xnew,slices=which_functions) if self.EP: pb.subplot(211) - gpplot(Xnew,m,v) - if samples: - s = np.random.multivariate_normal(m.flatten(),v,samples) - pb.plot(Xnew.flatten(),s.T, alpha = 0.4, c='#3465a4', linewidth = 0.8) - if not self.EP: - pb.plot(Xorig,Yorig,'kx',mew=1.5) - pb.xlim(xmin,xmax) - else: - pb.xlim(xmin,xmax) + if samples: #NOTE why don't we put samples as a parameter of gpplot + s = np.random.multivariate_normal(m.flatten(),np.diag(v),samples) + pb.plot(Xnew.flatten(),s.T, alpha = 0.4, c='#3465a4', linewidth = 0.8) + pb.plot(Xorig,Yorig,'kx',mew=1.5) + pb.xlim(xmin,xmax) + + if self.EP: pb.subplot(212) - self.likelihood.plot1Db(self.X,Xnew,phi) + self.likelihood.plot(Xnew,phi,self.X) pb.xlim(xmin,xmax) elif self.X.shape[1]==2: diff --git a/GPy/models/sparse_GP.py b/GPy/models/sparse_GP.py index 1164a1af..655f6252 100644 --- a/GPy/models/sparse_GP.py +++ b/GPy/models/sparse_GP.py @@ -37,7 +37,7 @@ class sparse_GP(GP): :type normalize_(X|Y): bool """ - def __init__(self,X,Y,kernel=None, X_uncertainty=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False,likelihood=None,method_ep='DTC',epsilon_ep=1e-3,epsilon_em=.1,power_ep=[1.,1.]): + def __init__(self,X,Y=None,kernel=None, X_uncertainty=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False,likelihood=None,method_ep='DTC',epsilon_ep=1e-3,epsilon_em=.1,power_ep=[1.,1.]): if Z is None: self.Z = np.random.permutation(X.copy())[:M] @@ -53,10 +53,8 @@ class sparse_GP(GP): self.has_uncertain_inputs=True self.X_uncertainty = X_uncertainty - - self.beta = beta #FIXME - GP.__init__(self, X, Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y,likelihood=likelihood,epsilon_ep=epsilon_ep,epsion_em=epsilon_em,power_ep=power_ep) - self.beta = beta if isinstance(likelihood,gaussian) else self.tau_tilde #TODO this should be defined in GP.__init__ + GP.__init__(self, X=X, Y=Y, kernel=kernel, normalize_X=normalize_X,
normalize_Y=normalize_Y,likelihood=likelihood,epsilon_ep=epsilon_ep,epsilon_em=epsilon_em,power_ep=power_ep) + self.trYYT = np.sum(np.square(self.Y)) if not self.EP else None #normalise X uncertainty also @@ -74,10 +72,55 @@ class sparse_GP(GP): else: self.Z = p[:self.M*self.Q].reshape(self.M, self.Q) self.kern._set_params(p[self.Z.size:]) - #self._compute_kernel_matrices() this is replaced by _ep_covariance - self._ep_covariance() + #self._compute_kernel_matrices() this is replaced by _ep_kernel_matrices + self._ep_kernel_matrices() self._ep_computations() + def _compute_kernel_matrices(self): + # kernel computations, using BGPLVM notation + #TODO: slices for psi statistics (easy enough) + + self.Kmm = self.kern.K(self.Z) + if self.has_uncertain_inputs: + if self.hetero_noise: + raise NotImplementedError, "uncertain ips and het noise not yet supported" + else: + self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum() + self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T + self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty) + else: + if self.hetero_noise: + print "rick's stuff here" + else: + self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum() + self.psi1 = self.kern.K(self.Z,self.X) + self.psi2 = np.dot(self.psi1,self.psi1.T) + + def _computations(self): + # TODO find routine to multiply triangular matrices + self.V = self.beta*self.Y + self.psi1V = np.dot(self.psi1, self.V) + self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T) + self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm) + self.A = mdot(self.Lmi, self.beta*self.psi2, self.Lmi.T) + self.B = np.eye(self.M) + self.A + self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B) + self.LLambdai = np.dot(self.LBi, self.Lmi) + self.trace_K = self.psi0 - np.trace(self.A)/self.beta + self.LBL_inv = mdot(self.Lmi.T, self.Bi, self.Lmi) + self.C = mdot(self.LLambdai, self.psi1V) + self.G = mdot(self.LBL_inv, self.psi1VVpsi1, self.LBL_inv.T) + + # Compute dL_dpsi + self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N) + self.dL_dpsi1 = mdot(self.LLambdai.T,self.C,self.V.T) + self.dL_dpsi2 = - 0.5 * self.beta * (self.D*(self.LBL_inv - self.Kmmi) + self.G) + + # Compute dL_dKmm + self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi) # dB + self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - 2.*self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi) + self.Kmmi) # dC + self.dL_dKmm += np.dot(np.dot(self.G,self.beta*self.psi2) - np.dot(self.LBL_inv, self.psi1VVpsi1), self.Kmmi) + 0.5*self.G # dE + def approximate_likelihood(self): assert not isinstance(self.likelihood, gaussian), "EP is only available for non-gaussian likelihoods" if self.ep_proxy == 'DTC': @@ -88,6 +131,22 @@ class sparse_GP(GP): else: self.ep_approx = Full(self.X,self.likelihood,self.kernel,inducing=None,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta]) self.beta, self.v_tilde, self.Z_hat, self.tau_, self.v_=self.ep_approx.fit_EP() + self._ep_kernel_matrices() + self._computations() + + def _ep_kernel_matrices(self): + self.Kmm = self.kern.K(self.Z) + if self.has_uncertain_inputs: + self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum() + self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T + self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty) #FIXME include beta + else: + self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices) + self.psi1 = self.kern.K(self.Z,self.X) + self.psi2 = np.dot(self.psi1,self.psi1.T) + self.psi2_beta_scaled = 
np.dot(self.psi1,self.beta*self.psi1.T) + + def _ep_computations(self): # Y: EP likelihood is defined as a regression model for mu_tilde self.Y = self.v_tilde/self.beta self._Ymean = np.zeros((1,self.Y.shape[1])) @@ -99,50 +158,17 @@ class sparse_GP(GP): else: self.YYT = None self.mu_ = self.v_/self.tau_ - self._ep_covariance() - self._computations() - - def _ep_covariance(self): - self.Kmm = self.kern.K(self.Z) - if self.has_uncertain_inputs: - self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum() - self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T - self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty) #FIXME include beta - else: - #self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum() - self.Knn_diag = self.kern.Kdiag(self.X,slices=self.Xslices) - self.psi0 = (self.beta*self.Knn_diag).sum() #TODO check dimensions - self.psi1 = self.kern.K(self.Z,self.X) - #self.psi2 = np.dot(self.psi1,self.psi1.T) - self.psi2 = np.dot(self.psi1,self.beta*self.psi1.T) - - def _compute_kernel_matrices(self): - # kernel computations, using BGPLVM notation - #TODO: slices for psi statistics (easy enough) - - self.Kmm = self.kern.K(self.Z) - if self.has_uncertain_inputs: - self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum() - self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T - self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty) - else: - self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum() - self.psi1 = self.kern.K(self.Z,self.X) - self.psi2 = np.dot(self.psi1,self.psi1.T) - - def _ep_computations(self): # TODO find routine to multiply triangular matrices self.V = self.beta*self.Y self.psi1V = np.dot(self.psi1, self.V) self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T) self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm) #self.A = mdot(self.Lmi, self.beta*self.psi2, self.Lmi.T) - self.A = mdot(self.Lmi, self.psi2, self.Lmi.T) + self.A = mdot(self.Lmi, self.psi2_beta_scaled, self.Lmi.T) self.B = np.eye(self.M) + self.A self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B) self.LLambdai = np.dot(self.LBi, self.Lmi) - #self.trace_K = self.psi0 - np.sum(np.dot(self.Lmi,self.psi1)**2,-1) #TODO check - self.trace_K = self.psi0 - np.trace(self.A) + self.trace_K = self.psi0.sum() - np.trace(self.A) self.LBL_inv = mdot(self.Lmi.T, self.Bi, self.Lmi) self.C = mdot(self.LLambdai, self.psi1V) self.G = mdot(self.LBL_inv, self.psi1VVpsi1, self.LBL_inv.T) @@ -176,10 +202,15 @@ class sparse_GP(GP): Compute the (lower bound on the) log marginal likelihood """ beta_logdet = self.N*self.D*np.log(self.beta) if not self.EP else self.D*np.sum(np.log(self.beta)) - A = -0.5*self.N*self.D*(np.log(2.*np.pi)) - 0.5*beta_logdet - B = -0.5*self.beta*self.D*self.trace_K if not self.EP else -0.5*self.D*self.trace_K + if self.hetero_noise: + A = foo + B = bar + D = -0.5*self.trbetaYYT + else: + A = -0.5*self.N*self.D*(np.log(2.*np.pi)) - 0.5*beta_logdet + B = -0.5*self.beta*self.D*self.trace_K if not self.EP else -0.5*self.D*self.trace_K + D = -0.5*self.beta*self.trYYT C = -0.5*self.D * self.B_logdet - D = -0.5*self.beta*self.trYYT if not self.EP else -0.5*self.trbetaYYT E = +0.5*np.sum(self.psi1VVpsi1 * self.LBL_inv) return A+B+C+D+E @@ -243,13 +274,14 @@ class sparse_GP(GP): noise_term = 1./self.beta if not self.EP else 0 Kxx = self.kern.Kdiag(Xnew) var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.LBL_inv, Kx),0) + noise_term - return mu,var + return mu,var,None#TODO add phi for EP def plot(self, *args, **kwargs): """ Plot the fitted 
model: just call the GP_regression plot function and then add inducing inputs """ - GP_regression.plot(self,*args,**kwargs) + #GP_regression.plot(self,*args,**kwargs) + GP.plot(self,*args,**kwargs) if self.Q==1: pb.plot(self.Z,self.Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12) if self.has_uncertain_inputs: From 7737cecf6db40188ceaf626e2287d380c6705e0e Mon Sep 17 00:00:00 2001 From: Ricardo Andrade Date: Mon, 28 Jan 2013 18:01:55 +0000 Subject: [PATCH 06/44] EM algorithm --- GPy/examples/ep_fix.py | 1 + GPy/models/GP.py | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/GPy/examples/ep_fix.py b/GPy/examples/ep_fix.py index c4e025dd..49ebd5aa 100644 --- a/GPy/examples/ep_fix.py +++ b/GPy/examples/ep_fix.py @@ -35,5 +35,6 @@ print m.checkgrad() m.optimize() #m.em(plot_all=False) # EM algorithm m.plot(samples=3) +m.EM() print(m) diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 3a9f6de8..51da0490 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -229,6 +229,33 @@ class GP(model): phi = None if not self.EP else self.likelihood.predictive_mean(mu,var) return mu, var, phi + def EM(self,max_f_eval=20,epsilon=.1,plot_all=False): #TODO check this makes sense + """ + Fits sparse_EP and optimizes the hyperparameters iteratively until convergence is achieved. + """ + self.epsilon_em = epsilon + log_likelihood_change = self.epsilon_em + 1. + self.parameters_path = [self._get_params()] + self.approximate_likelihood() + self.site_approximations_path = [[self.ep_approx.tau_tilde,self.ep_approx.v_tilde]] + self.log_likelihood_path = [self.log_likelihood()] + iteration = 0 + while log_likelihood_change > self.epsilon_em: + print 'EM iteration', iteration + self.optimize(max_f_eval = max_f_eval) + log_likelihood_new = self.log_likelihood() + log_likelihood_change = log_likelihood_new - self.log_likelihood_path[-1] + if log_likelihood_change < 0: + print 'log_likelihood decrement' + self._set_params(self.parameters_path[-1]) + self.kern._set_params(self.parameters_path[-1]) + else: + self.approximate_likelihood() + self.log_likelihood_path.append(self.log_likelihood()) + self.parameters_path.append(self._get_params()) + self.site_approximations_path.append([self.ep_approx.tau_tilde,self.ep_approx.v_tilde]) + iteration += 1 + def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None): """ :param samples: the number of a posteriori samples to plot From d9a3226f4989c15ccb1f23b3daf6c76db5c46b8e Mon Sep 17 00:00:00 2001 From: Ricardo Andrade Date: Tue, 29 Jan 2013 12:06:34 +0000 Subject: [PATCH 07/44] EM algorithm for EP. --- GPy/core/model.py | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/GPy/core/model.py b/GPy/core/model.py index 4a1791bd..b6b280a1 100644 --- a/GPy/core/model.py +++ b/GPy/core/model.py @@ -381,6 +381,43 @@ class model(parameterised): print grad_string print '' - return False return True + + def EM(self,epsilon=.1,**kwargs): + """ + Expectation maximization for Expectation Propagation. + + kwargs are passed to the optimize function. They can be: + + :epsilon: convergence criterion + :max_f_eval: maximum number of function evaluations + :messages: whether to display during optimisation + :param optimizer: which optimizer to use (defaults to self.preferred_optimizer) + :type optimizer: string TODO: valid strings? + + """ + assert self.EP, "EM not available for gaussian likelihood" + log_change = epsilon + 1. 
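# Note on the loop that follows: this is an EM-style alternation. The E-step
# (self.approximate_likelihood()) refits the EP site parameters with the kernel
# hyperparameters held fixed; the M-step (self.optimize(**kwargs)) refits the
# hyperparameters against the gaussian proxy defined by those sites. Neither
# step is guaranteed to increase the EP approximation of the log-likelihood,
# hence the bookkeeping below that records each accepted state and discards an
# iteration whose log-likelihood decreases.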
+ self.log_likelihood_record = [] + self.gp_params_record = [] + self.ep_params_record = [] + iteration = 0 + last_value = -np.inf + while log_change > epsilon or not iteration: + print 'EM iteration %s' %iteration + self.approximate_likelihood() + self.optimize(**kwargs) + new_value = self.log_likelihood() + log_change = new_value - last_value + if log_change > epsilon: + self.log_likelihood_record.append(new_value) + self.gp_params_record.append(self._get_params()) + self.ep_params_record.append((self.beta,self.Y,self.Z_ep)) + last_value = new_value + else: + convergence = False + self.beta, self.Y, self.Z_ep = self.ep_params_record[-1] + self._set_params(self.gp_params_record[-1]) + print "Log-likelihood decrement: %s \nLast iteration discarded." %log_change + iteration += 1 From 691aeeaf22ca28f28190af3ce8ba02d0d0205e94 Mon Sep 17 00:00:00 2001 From: Ricardo Andrade Date: Tue, 29 Jan 2013 12:07:19 +0000 Subject: [PATCH 08/44] GP model works now. --- GPy/models/GP.py | 36 +++++------------------------------- 1 file changed, 5 insertions(+), 31 deletions(-) diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 51da0490..95145978 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -13,7 +13,7 @@ from ..inference.likelihoods import likelihood,probit,poisson,gaussian class GP(model): """ - Gaussian Process model for regression + Gaussian Process model for regression and EP :param X: input observations :param Y: observed values @@ -35,7 +35,7 @@ class GP(model): #TODO: make beta parameter explicit #TODO: when using EP, predict needs to return 3 values otherwise it just needs 2. At the moment predict returns 3 values in any case. - def __init__(self,X,Y=None,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None,likelihood=None,epsilon_ep=1e-3,epsilon_em=.1,power_ep=[1.,1.]): + def __init__(self,X,Y=None,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None,likelihood=None,epsilon_ep=1e-3,power_ep=[1.,1.]): # parse arguments self.Xslices = Xslices @@ -121,6 +121,9 @@ class GP(model): return self.kern._get_param_names_transformed() def approximate_likelihood(self): + """ + Approximates a non-gaussian likelihood using Expectation Propagation + """ assert not isinstance(self.likelihood, gaussian), "EP is only available for non-gaussian likelihoods" self.ep_approx = Full(self.K,self.likelihood,epsilon = self.epsilon_ep,power_ep=[self.eta,self.delta]) self.beta, self.Y, self.Z_ep = self.ep_approx.fit_EP() @@ -170,7 +173,6 @@ class GP(model): def predict(self,Xnew, slices=None, full_cov=False): """ - Predict the function(s) at the new point(s) Xnew. Arguments @@ -193,7 +195,6 @@ If full_cov and self.D > 1, the return shape of var is Nnew x Nnew x self.D. If self.D == 1, the return shape is Nnew x Nnew. This is to allow for different normalisations of the output dimensions. - """ #normalise X values @@ -229,33 +230,6 @@ class GP(model): phi = None if not self.EP else self.likelihood.predictive_mean(mu,var) return mu, var, phi - def EM(self,max_f_eval=20,epsilon=.1,plot_all=False): #TODO check this makes sense - """ - Fits sparse_EP and optimizes the hyperparameters iteratively until convergence is achieved. - """ - self.epsilon_em = epsilon - log_likelihood_change = self.epsilon_em + 1. 
- self.parameters_path = [self._get_params()] - self.approximate_likelihood() - self.site_approximations_path = [[self.ep_approx.tau_tilde,self.ep_approx.v_tilde]] - self.log_likelihood_path = [self.log_likelihood()] - iteration = 0 - while log_likelihood_change > self.epsilon_em: - print 'EM iteration', iteration - self.optimize(max_f_eval = max_f_eval) - log_likelihood_new = self.log_likelihood() - log_likelihood_change = log_likelihood_new - self.log_likelihood_path[-1] - if log_likelihood_change < 0: - print 'log_likelihood decrement' - self._set_params(self.parameters_path[-1]) - self.kern._set_params(self.parameters_path[-1]) - else: - self.approximate_likelihood() - self.log_likelihood_path.append(self.log_likelihood()) - self.parameters_path.append(self._get_params()) - self.site_approximations_path.append([self.ep_approx.tau_tilde,self.ep_approx.v_tilde]) - iteration += 1 - def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None): """ :param samples: the number of a posteriori samples to plot From 9972862ea22164a89e05b1667a45cbadf8d780e9 Mon Sep 17 00:00:00 2001 From: Ricardo Andrade Date: Tue, 29 Jan 2013 12:08:50 +0000 Subject: [PATCH 09/44] Test file. --- GPy/examples/ep_fix.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/GPy/examples/ep_fix.py b/GPy/examples/ep_fix.py index 49ebd5aa..1d7b4741 100644 --- a/GPy/examples/ep_fix.py +++ b/GPy/examples/ep_fix.py @@ -26,15 +26,14 @@ likelihood = GPy.inference.likelihoods.probit(data['Y'][:, 0:1]) m = GPy.models.GP(data['X'],likelihood=likelihood) #m = GPy.models.GP(data['X'],likelihood.Y) - m.ensure_default_constraints() + +# Optimize and plot if not isinstance(m.likelihood,GPy.inference.likelihoods.gaussian): m.approximate_likelihood() -print m.checkgrad() -# Optimize and plot -m.optimize() -#m.em(plot_all=False) # EM algorithm -m.plot(samples=3) +#m.optimize() m.EM() +print m.log_likelihood() +m.plot(samples=3) print(m) From 01f0378f840821fdac8acc0652be213ef77a536f Mon Sep 17 00:00:00 2001 From: Ricardo Andrade Date: Tue, 29 Jan 2013 12:23:49 +0000 Subject: [PATCH 10/44] Other change. --- GPy/examples/ep_fix.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPy/examples/ep_fix.py b/GPy/examples/ep_fix.py index 1d7b4741..8041cc91 100644 --- a/GPy/examples/ep_fix.py +++ b/GPy/examples/ep_fix.py @@ -29,8 +29,8 @@ m = GPy.models.GP(data['X'],likelihood=likelihood) m.ensure_default_constraints() # Optimize and plot -if not isinstance(m.likelihood,GPy.inference.likelihoods.gaussian): - m.approximate_likelihood() +#if not isinstance(m.likelihood,GPy.inference.likelihoods.gaussian): +# m.approximate_likelihood() #m.optimize() m.EM() From dbf920ebd5f72746876ce9f54efa4ac7401e25a9 Mon Sep 17 00:00:00 2001 From: Ricardo Andrade Date: Tue, 29 Jan 2013 12:29:22 +0000 Subject: [PATCH 11/44] Minor change in EM explanation. --- GPy/core/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPy/core/model.py b/GPy/core/model.py index b6b280a1..ccfbf298 100644 --- a/GPy/core/model.py +++ b/GPy/core/model.py @@ -397,7 +397,7 @@ class model(parameterised): :type optimizer: string TODO: valid strings? """ - assert self.EP, "EM not available for gaussian likelihood" + assert self.EP, "EM is not available for gaussian likelihood" log_change = epsilon + 1. 
self.log_likelihood_record = [] self.gp_params_record = [] From 217fa0e70eaad1eecc3ef77f541a03435ad7ef50 Mon Sep 17 00:00:00 2001 From: Ricardo Andrade Date: Tue, 29 Jan 2013 16:44:12 +0000 Subject: [PATCH 12/44] Now it works. --- GPy/examples/poisson.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/GPy/examples/poisson.py b/GPy/examples/poisson.py index 71d80b30..e15f310d 100644 --- a/GPy/examples/poisson.py +++ b/GPy/examples/poisson.py @@ -25,16 +25,14 @@ seed=default_seed """ X = np.arange(0,100,5)[:,None] -F = np.round(np.sin(X/18.) + .1*X) -E = np.random.randint(-3,3,20)[:,None] +F = np.round(np.sin(X/18.) + .1*X) + np.arange(5,25)[:,None] +E = np.random.randint(-5,5,20)[:,None] Y = F + E -pb.plot(X,F,'k-') -pb.plot(X,Y,'ro') pb.figure() -likelihood = GPy.inference.likelihoods.poisson(Y,scale=6.) +likelihood = GPy.inference.likelihoods.poisson(Y,scale=1.) m = GPy.models.GP(X,likelihood=likelihood) -#m = GPy.models.GP(data['X'],Y=likelihood.Y) +#m = GPy.models.GP(X,Y=likelihood.Y) m.constrain_positive('var') m.constrain_positive('len') From cab3b77b6b4963415ee3e0143a650e560478ddb5 Mon Sep 17 00:00:00 2001 From: Ricardo Andrade Date: Tue, 29 Jan 2013 16:44:42 +0000 Subject: [PATCH 13/44] Assertions included. --- GPy/inference/likelihoods.py | 84 ++++++++++-------------------------- 1 file changed, 22 insertions(+), 62 deletions(-) diff --git a/GPy/inference/likelihoods.py b/GPy/inference/likelihoods.py index 864afa57..b170dc3d 100644 --- a/GPy/inference/likelihoods.py +++ b/GPy/inference/likelihoods.py @@ -9,65 +9,18 @@ import pylab as pb from ..util.plot import gpplot class likelihood: - def __init__(self,Y,location=0,scale=1): - """ - Likelihood class for doing Expectation propagation + """ + Likelihood class for doing Expectation propagation - :param Y: observed output (Nx1 numpy.darray) - ..Note:: Y values allowed depend on the likelihood used - """ + :param Y: observed output (Nx1 numpy.darray) + ..Note:: Y values allowed depend on the likelihood used + """ + def __init__(self,Y,location=0,scale=1): self.Y = Y self.N = self.Y.shape[0] self.location = location self.scale = scale - def plot1D(self,X,mean,var,Z=None,mean_Z=None,var_Z=None,samples=0): - """ - Plot the predictive distribution of the GP model for 1-dimensional inputs - - :param X: The points at which to make a prediction - :param Mean: mean values at X - :param Var: variance values at X - :param Z: Set of points to be highlighted in the plot, i.e. 
inducing points - :param mean_Z: mean values at Z - :param var_Z: variance values at Z - :samples: Number of samples to plot - """ - assert X.shape[1] == 1, 'Number of dimensions must be 1' - gpplot(X,mean,var.flatten()) - if samples: #NOTE why don't we put samples as a parameter of gpplot - s = np.random.multivariate_normal(mean.flatten(),np.diag(var),samples) - pb.plot(X.flatten(),s.T, alpha = 0.4, c='#3465a4', linewidth = 0.8) - #pb.subplot(211) - #self.plot1Da(X,mean,var,Z,mean_Z,var_Z) - - def plot1Da(self,X,mean,var,Z=None,mean_Z=None,var_Z=None): - """ - Plot the predictive distribution of the GP model for 1-dimensional inputs - - :param X_new: The points at which to make a prediction - :param Mean_new: mean values at X_new - :param Var_new: variance values at X_new - :param X_u: input (inducing) points used to train the model - :param Mean_u: mean values at X_u - :param Var_new: variance values at X_u - """ - assert X.shape[1] == 1, 'Number of dimensions must be 1' - gpplot(X,mean,var.flatten()) - pb.errorbar(Z.flatten(),mean_Z.flatten(),2*np.sqrt(var_Z.flatten()),fmt='r+') - pb.plot(Z,mean_Z,'ro') - - """ - def plot1Db(self,X_obs,X,phi,Z=None): - assert X_obs.shape[1] == 1, 'Number of dimensions must be 1' - gpplot(X,phi,np.zeros(X.shape[0])) - pb.plot(X_obs,(self.Y+1)/2,'kx',mew=1.5) - pb.ylim(-0.2,1.2) - if Z is not None: - pb.plot(Z,Z*0+.5,'r|',mew=1.5,markersize=12) - - """ def plot2D(self,X,X_new,F_new,U=None): """ Predictive distribution of the fitted GP model for 2-dimensional inputs @@ -106,6 +59,10 @@ class probit(likelihood): L(x) = \\Phi (Y_i*f_i) $$ """ + def __init__(self,Y,location=0,scale=1): + assert np.all(np.abs(Y) == 1), "Output values must be either -1 or 1" + likelihood.__init__(self,Y,location,scale) + def moments_match(self,i,tau_i,v_i): """ Moments match of the marginal approximation in EP algorithm @@ -146,6 +103,10 @@ class poisson(likelihood): L(x) = \exp(-\lambda) * \lambda**Y_i / Y_i! 
$$ """ + def __init__(self,Y,location=0,scale=1): + assert len(Y[Y<0]) == 0, "Output cannot have negative values" + likelihood.__init__(self,Y,location,scale) + def moments_match(self,i,tau_i,v_i): """ Moments match of the marginal approximation in EP algorithm @@ -203,20 +164,19 @@ class poisson(likelihood): sigma2_hat = m2 - mu_hat**2 # Second central moment return float(Z_hat), float(mu_hat), float(sigma2_hat) - def plot1Db(self,X,X_new,F_new,F2_new=None,U=None): - pb.subplot(212) - #gpplot(X_new,F_new,np.sqrt(F2_new)) - pb.plot(X_new,F_new)#,np.sqrt(F2_new)) #FIXME - pb.plot(X,self.Y,'kx',mew=1.5) - if U is not None: - pb.plot(U,np.ones(U.shape[0])*self.Y.min()*.8,'r|',mew=1.5,markersize=12) def predictive_mean(self,mu,variance): return np.exp(mu*self.scale + self.location) - def predictive_variance(self,mu,variance): - return mu + def _log_likelihood_gradients(): raise NotImplementedError + def plot(self,X,phi,X_obs,Z=None): + assert X_obs.shape[1] == 1, 'Number of dimensions must be 1' + gpplot(X,phi,np.zeros(X.shape[0])) + pb.plot(X_obs,self.Y,'kx',mew=1.5) + if Z is not None: + pb.plot(Z,Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12) + class gaussian(likelihood): """ Gaussian likelihood From ec89c4efc300b7e3e5622c6cd018d6fe7deda55b Mon Sep 17 00:00:00 2001 From: Ricardo Andrade Date: Tue, 29 Jan 2013 16:45:00 +0000 Subject: [PATCH 14/44] _compute_GP_variables --- GPy/inference/EP.py | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/GPy/inference/EP.py b/GPy/inference/EP.py index 5d571888..5c473a8f 100644 --- a/GPy/inference/EP.py +++ b/GPy/inference/EP.py @@ -48,13 +48,13 @@ class EP: self.tau_tilde = np.zeros(self.N) self.v_tilde = np.zeros(self.N) - def restart_EP(self): - """ - Set the EP approximation to initial state - """ - self.tau_tilde = np.zeros(self.N) - self.v_tilde = np.zeros(self.N) - self.mu = np.zeros(self.N) + def _compute_GP_variables(self): + #Variables to be called from GP + mu_tilde = self.v_tilde/self.tau_tilde #When calling EP, this variable is used instead of Y in the GP model + sigma_sum = 1./self.tau_ + 1./self.tau_tilde + mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2 + Z_ep = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant + return self.tau_tilde[:,None], mu_tilde[:,None], Z_ep class Full(EP): def fit_EP(self): @@ -122,12 +122,7 @@ class Full(EP): self.np1.append(self.tau_tilde.copy()) self.np2.append(self.v_tilde.copy()) - #Variables to be called from GP - mu_tilde = self.v_tilde/self.tau_tilde #When calling EP, this variable is used instead of Y in the GP model - sigma_sum = 1./self.tau_ + 1./self.tau_tilde - mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2 - Z_ep = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant - return self.tau_tilde[:,None], mu_tilde[:,None], Z_ep + return self._compute_GP_variables() class DTC(EP): def fit_EP(self): @@ -212,7 +207,8 @@ class DTC(EP): epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N self.np1.append(self.tau_tilde.copy()) self.np2.append(self.v_tilde.copy()) - return self.tau_tilde[:,None], self.v_tilde[:,None], self.Z_hat[:,None], self.tau_[:,None], self.v_[:,None] + + return self._compute_GP_variables() class FITC(EP): def fit_EP(self): @@ -313,4 +309,5 @@ class FITC(EP): epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N self.np1.append(self.tau_tilde.copy()) self.np2.append(self.v_tilde.copy()) - return 
self.tau_tilde[:,None], self.v_tilde[:,None], self.Z_hat[:,None], self.tau_[:,None], self.v_[:,None] + + return self._compute_GP_variables() From 0a88df62c3bd9d7bac7b183ca316e666a452438b Mon Sep 17 00:00:00 2001 From: Ricardo Andrade Date: Tue, 29 Jan 2013 16:45:31 +0000 Subject: [PATCH 15/44] Minor changes. --- GPy/models/sparse_GP.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/GPy/models/sparse_GP.py b/GPy/models/sparse_GP.py index 655f6252..f5381eed 100644 --- a/GPy/models/sparse_GP.py +++ b/GPy/models/sparse_GP.py @@ -35,9 +35,13 @@ class sparse_GP(GP): :type beta: float :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales) :type normalize_(X|Y): bool + :parm likelihood: a GPy likelihood, defaults to gaussian + :param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1 + :param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.] + :type powerep: list """ - def __init__(self,X,Y=None,kernel=None, X_uncertainty=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False,likelihood=None,method_ep='DTC',epsilon_ep=1e-3,epsilon_em=.1,power_ep=[1.,1.]): + def __init__(self,X,Y=None,kernel=None,X_uncertainty=None,beta=100.,Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False,likelihood=None,method_ep='DTC',epsilon_ep=1e-3,power_ep=[1.,1.]): if Z is None: self.Z = np.random.permutation(X.copy())[:M] @@ -53,7 +57,7 @@ class sparse_GP(GP): self.has_uncertain_inputs=True self.X_uncertainty = X_uncertainty - GP.__init__(self, X=X, Y=Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y,likelihood=likelihood,epsilon_ep=epsilon_ep,epsilon_em=epsilon_em,power_ep=power_ep) + GP.__init__(self, X=X, Y=Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y,likelihood=likelihood,epsilon_ep=epsilon_ep,power_ep=power_ep) self.trYYT = np.sum(np.square(self.Y)) if not self.EP else None @@ -91,6 +95,9 @@ class sparse_GP(GP): else: if self.hetero_noise: print "rick's stuff here" + + + else: self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum() self.psi1 = self.kern.K(self.Z,self.X) From bb1e0021d7ae0ebd5d06ec19e2f6b47d02d240c9 Mon Sep 17 00:00:00 2001 From: Ricardo Andrade Date: Tue, 29 Jan 2013 18:01:47 +0000 Subject: [PATCH 16/44] More changes. 
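This commit threads the per-point EP site precisions (beta) through the sparse computations: once beta is a vector rather than a scalar, psi2 can no longer be formed as psi1 psi1^T and scaled afterwards, so a beta-scaled product is built directly. A minimal numpy sketch of the quantity being introduced (the toy shapes M=3, N=10 and the random stand-ins for the kernel matrices are assumptions for illustration only):

import numpy as np

rng = np.random.RandomState(0)
M, N = 3, 10
psi1 = rng.randn(M, N)                     # plays the role of K(Z, X)
beta = rng.rand(N, 1) + 0.5                # per-point EP site precisions
W = rng.randn(M, M)
Kmm = np.dot(W, W.T) + np.eye(M)           # stand-in positive definite K(Z, Z)

# beta*psi1.T scales row i of psi1.T by beta_i, so the product below is
# psi1 diag(beta) psi1^T; with a scalar beta it reduces to beta * psi1 psi1^T,
# which is why the homoscedastic branch can keep the old form.
psi2_beta_scaled = np.dot(psi1, beta * psi1.T)

Lm = np.linalg.cholesky(Kmm)
Lmi = np.linalg.inv(Lm)
A = np.dot(np.dot(Lmi, psi2_beta_scaled), Lmi.T)   # as in _computations
B = np.eye(M) + A                                  # log|B| enters the bound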
--- GPy/models/GP.py | 4 +- GPy/models/sparse_GP.py | 136 ++++++++++++++-------------------------- 2 files changed, 48 insertions(+), 92 deletions(-) diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 95145978..4d80ab87 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -73,7 +73,6 @@ class GP(model): self.EP = False self.Y = Y self.beta = 100.#FIXME beta should be an explicit parameter for this model - # Here's some simple normalisation if normalize_Y: self._Ymean = Y.mean(0)[None,:] @@ -88,8 +87,9 @@ class GP(model): self.YYT = np.dot(self.Y, self.Y.T) else: self.YYT = None - else: + if self.D > 1: + raise NotImplementedError, "EP is not implemented for D > 1" # Y is defined after approximating the likelihood self.EP = True self.eta,self.delta = power_ep diff --git a/GPy/models/sparse_GP.py b/GPy/models/sparse_GP.py index f5381eed..ea1ba100 100644 --- a/GPy/models/sparse_GP.py +++ b/GPy/models/sparse_GP.py @@ -60,48 +60,52 @@ class sparse_GP(GP): GP.__init__(self, X=X, Y=Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y,likelihood=likelihood,epsilon_ep=epsilon_ep,power_ep=power_ep) self.trYYT = np.sum(np.square(self.Y)) if not self.EP else None - #normalise X uncertainty also if self.has_uncertain_inputs: self.X_uncertainty /= np.square(self._Xstd) def _set_params(self, p): + self.Z = p[:self.M*self.Q].reshape(self.M, self.Q) if not self.EP: - self.Z = p[:self.M*self.Q].reshape(self.M, self.Q) - self.beta = p[self.M*self.Q] + #self.beta = p[self.M*self.Q] + self.beta = np.repeat(p[self.M*self.Q],self.N)[:,None] self.kern._set_params(p[self.Z.size + 1:]) self.beta2 = self.beta**2 - self._compute_kernel_matrices() - self._computations() else: - self.Z = p[:self.M*self.Q].reshape(self.M, self.Q) self.kern._set_params(p[self.Z.size:]) - #self._compute_kernel_matrices() this is replaced by _ep_kernel_matrices - self._ep_kernel_matrices() - self._ep_computations() + if self.Y is None: + self.Y = np.ones([self.N,1]) + self._compute_kernel_matrices() + self._computations() + + def _get_params(self): + if not self.EP: + return np.hstack([self.Z.flatten(),self.beta,self.kern._get_params_transformed()]) + else: + return np.hstack([self.Z.flatten(),self.kern._get_params_transformed()]) + + def _get_param_names(self): + if not self.EP: + return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + ['noise_precision']+self.kern._get_param_names_transformed() + else: + return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + self.kern._get_param_names_transformed() def _compute_kernel_matrices(self): # kernel computations, using BGPLVM notation #TODO: slices for psi statistics (easy enough) - self.Kmm = self.kern.K(self.Z) if self.has_uncertain_inputs: - if self.hetero_noise: - raise NotImplementedError, "uncertain ips and het noise not yet supported" - else: - self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum() + if not self.EP: + self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty)#.sum() self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T - self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty) - else: - if self.hetero_noise: - print "rick's stuff here" - - - + self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty)#FIXME add beta vector else: - self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum() - self.psi1 = self.kern.K(self.Z,self.X) - self.psi2 = np.dot(self.psi1,self.psi1.T) + raise NotImplementedError, "uncertain_inputs 
not yet supported for EP" + else: + self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices)#.sum() FIXME + self.psi1 = self.kern.K(self.Z,self.X) + self.psi2 = np.dot(self.psi1,self.psi1.T) + self.psi2_beta_scaled = np.dot(self.psi1,self.beta*self.psi1.T) def _computations(self): # TODO find routine to multiply triangular matrices @@ -109,17 +113,17 @@ class sparse_GP(GP): self.psi1V = np.dot(self.psi1, self.V) self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T) self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm) - self.A = mdot(self.Lmi, self.beta*self.psi2, self.Lmi.T) + self.A = mdot(self.Lmi, self.psi2_beta_scaled, self.Lmi.T) self.B = np.eye(self.M) + self.A self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B) self.LLambdai = np.dot(self.LBi, self.Lmi) - self.trace_K = self.psi0 - np.trace(self.A)/self.beta + self.trace_K = self.psi0.sum() - np.trace(self.A) self.LBL_inv = mdot(self.Lmi.T, self.Bi, self.Lmi) self.C = mdot(self.LLambdai, self.psi1V) self.G = mdot(self.LBL_inv, self.psi1VVpsi1, self.LBL_inv.T) # Compute dL_dpsi - self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N) + self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones([self.N,1]) self.dL_dpsi1 = mdot(self.LLambdai.T,self.C,self.V.T) self.dL_dpsi2 = - 0.5 * self.beta * (self.D*(self.LBL_inv - self.Kmmi) + self.G) @@ -133,76 +137,28 @@ class sparse_GP(GP): if self.ep_proxy == 'DTC': self.ep_approx = DTC(self.Kmm,self.likelihood,self.psi1,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta]) elif self.ep_proxy == 'FITC': - self.Knn_diag = self.kern.psi0(self.Z,self.X, self.X_uncertainty) #TODO psi0 already calculates this - self.ep_approx = FITC(self.Kmm,self.likelihood,self.psi1,self.Knn_diag,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta]) + self.ep_approx = FITC(self.Kmm,self.likelihood,self.psi1,self.psi0,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta]) else: self.ep_approx = Full(self.X,self.likelihood,self.kernel,inducing=None,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta]) - self.beta, self.v_tilde, self.Z_hat, self.tau_, self.v_=self.ep_approx.fit_EP() - self._ep_kernel_matrices() + self.beta, self.Y, self.Z_ep = self.ep_approx.fit_EP() self._computations() - def _ep_kernel_matrices(self): - self.Kmm = self.kern.K(self.Z) - if self.has_uncertain_inputs: - self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum() - self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T - self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty) #FIXME include beta - else: - self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices) - self.psi1 = self.kern.K(self.Z,self.X) - self.psi2 = np.dot(self.psi1,self.psi1.T) - self.psi2_beta_scaled = np.dot(self.psi1,self.beta*self.psi1.T) - - def _ep_computations(self): - # Y: EP likelihood is defined as a regression model for mu_tilde - self.Y = self.v_tilde/self.beta - self._Ymean = np.zeros((1,self.Y.shape[1])) - self._Ystd = np.ones((1,self.Y.shape[1])) - self.trbetaYYT = np.sum(self.beta*np.square(self.Y)) - if self.D > self.N: - # then it's more efficient to store YYT - self.YYT = np.dot(self.Y, self.Y.T) - else: - self.YYT = None - self.mu_ = self.v_/self.tau_ - # TODO find routine to multiply triangular matrices - self.V = self.beta*self.Y - self.psi1V = np.dot(self.psi1, self.V) - self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T) - self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm) - #self.A = mdot(self.Lmi, self.beta*self.psi2, self.Lmi.T) - self.A = mdot(self.Lmi, self.psi2_beta_scaled, self.Lmi.T) 
- self.B = np.eye(self.M) + self.A - self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B) - self.LLambdai = np.dot(self.LBi, self.Lmi) - self.trace_K = self.psi0.sum() - np.trace(self.A) - self.LBL_inv = mdot(self.Lmi.T, self.Bi, self.Lmi) - self.C = mdot(self.LLambdai, self.psi1V) - self.G = mdot(self.LBL_inv, self.psi1VVpsi1, self.LBL_inv.T) - - # Compute dL_dpsi - #self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N) - self.dL_dpsi0 = - 0.5 * self.D * self.beta.flatten() * np.ones(self.N) #TODO check - self.dL_dpsi1 = mdot(self.LLambdai.T,self.C,self.V.T) - #self.dL_dpsi2 = - 0.5 * self.beta * (self.D*(self.LBL_inv - self.Kmmi) + self.G) - self.dL_dpsi2 = - 0.5 * self.beta * (self.D*(self.LBL_inv - self.Kmmi) + self.G) - - # Compute dL_dKmm - self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi) # dB - self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - 2.*self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi) + self.Kmmi) # dC - self.dL_dKmm += np.dot(np.dot(self.G,self.beta*self.psi2) - np.dot(self.LBL_inv, self.psi1VVpsi1), self.Kmmi) + 0.5*self.G # dE - - def _get_params(self): + def log_likelihood(self): + """ + Compute the (lower bound on the) log marginal likelihood + """ if not self.EP: - return np.hstack([self.Z.flatten(),self.beta,self.kern._get_params_transformed()]) + A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta)) else: - return np.hstack([self.Z.flatten(),self.kern._get_params_transformed()]) + A = -0.5*self.D*(self.N*np.log(2.*np.pi) - np.sum(np.log(self.beta))) + B = -0.5*self.D*self.trace_K + C = -0.5*self.D * self.B_logdet + D = -0.5*self.beta*self.trYYT + E = +0.5*np.sum(self.psi1VVpsi1 * self.LBL_inv) + return A+B+C+D+E + + - def _get_param_names(self): - if not self.EP: - return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + ['noise_precision']+self.kern._get_param_names_transformed() - else: - return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + self.kern._get_param_names_transformed() def log_likelihood(self): """ From d1a0883c12f49bc25956812b4dcdfc0c66ca3b3b Mon Sep 17 00:00:00 2001 From: Ricardo Date: Tue, 29 Jan 2013 23:54:02 +0000 Subject: [PATCH 17/44] Log-likelihood,predictions and plotting are working. --- GPy/examples/sparse_ep_fix.py | 19 ++++---- GPy/inference/EP.py | 4 +- GPy/models/GP.py | 16 ++++--- GPy/models/sparse_GP.py | 81 +++++++++++++++++++---------------- 4 files changed, 64 insertions(+), 56 deletions(-) diff --git a/GPy/examples/sparse_ep_fix.py b/GPy/examples/sparse_ep_fix.py index 7e3f1fc3..f2c25898 100644 --- a/GPy/examples/sparse_ep_fix.py +++ b/GPy/examples/sparse_ep_fix.py @@ -31,18 +31,17 @@ noise = GPy.kern.white(1) kernel = rbf + noise # create simple GP model -#m1 = GPy.models.sparse_GP(X, Y, kernel, M=M) -m1 = GPy.models.sparse_GP(X,Y=None, kernel=kernel, M=M,likelihood= likelihood) +m = GPy.models.sparse_GP(X,Y=None, kernel=kernel, M=M,likelihood= likelihood) +#m = GPy.models.sparse_GP(X, Y, kernel, M=M) -print m1.checkgrad() # contrain all parameters to be positive -m1.constrain_positive('(variance|lengthscale|precision)') -#m1.constrain_positive('(variance|lengthscale)') -#m1.constrain_fixed('prec',10.) 
- +m.ensure_default_constraints() +if not isinstance(m.likelihood,GPy.inference.likelihoods.gaussian): + m.approximate_likelihood() +print m.checkgrad() #check gradient FIXME unit test please # optimize and plot -m1.optimize('tnc', messages = 1) -m1.plot() -# print(m1) +#m.optimize('tnc', messages = 1) +m.plot(samples=3,full_cov=False) +# print(m) diff --git a/GPy/inference/EP.py b/GPy/inference/EP.py index 5c473a8f..c3aad7c1 100644 --- a/GPy/inference/EP.py +++ b/GPy/inference/EP.py @@ -136,7 +136,7 @@ class DTC(EP): q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0) Sigma0 = Qnn = Knm*Kmmi*Kmn """ - self.Kmmi, self.Kmm_hld = pdinv(self.Kmm) + self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm) self.KmnKnm = np.dot(self.Kmn, self.Kmn.T) self.KmmiKmn = np.dot(self.Kmmi,self.Kmn) self.Qnn_diag = np.sum(self.Kmn*self.KmmiKmn,-2) @@ -222,7 +222,7 @@ class FITC(EP): q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0) Sigma0 = diag(Knn-Qnn) + Qnn, Qnn = Knm*Kmmi*Kmn """ - self.Kmmi, self.Kmm_hld = pdinv(self.Kmm) + self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm) self.P0 = self.Kmn.T self.KmnKnm = np.dot(self.P0.T, self.P0) self.KmmiKmn = np.dot(self.Kmmi,self.P0.T) diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 4d80ab87..482143d6 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -196,7 +196,6 @@ class GP(model): This is to allow for different normalisations of the output dimensions. """ - #normalise X values Xnew = (Xnew.copy() - self._Xmean) / self._Xstd mu, var, phi = self._raw_predict(Xnew, slices, full_cov) @@ -224,13 +223,18 @@ class GP(model): if full_cov: Kxx = self.kern.K(_Xnew, slices1=slices,slices2=slices) var = Kxx - np.dot(KiKx.T,Kx) + if self.EP: + raise NotImplementedError, "full_cov = True not implemented for EP" + #var = np.diag(var)[:,None] + #phi = self.likelihood.predictive_mean(mu,var) else: Kxx = self.kern.Kdiag(_Xnew, slices=slices) var = Kxx - np.sum(np.multiply(KiKx,Kx),0) - phi = None if not self.EP else self.likelihood.predictive_mean(mu,var) + if self.EP: + phi = self.likelihood.predictive_mean(mu,var) return mu, var, phi - def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None): + def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None,full_cov=False): """ :param samples: the number of a posteriori samples to plot :param which_data: which if the training data to plot (default all) @@ -268,13 +272,13 @@ class GP(model): if self.X.shape[1]==1: Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None] - m,v,phi = self.predict(Xnew,slices=which_functions) + m,v,phi = self.predict(Xnew,slices=which_functions,full_cov=full_cov) if self.EP: pb.subplot(211) gpplot(Xnew,m,v) if samples: #NOTE why don't we put samples as a parameter of gpplot - s = np.random.multivariate_normal(m.flatten(),np.diag(v),samples) + s = np.random.multivariate_normal(m.flatten(),np.diag(v.flatten()),samples) pb.plot(Xnew.flatten(),s.T, alpha = 0.4, c='#3465a4', linewidth = 0.8) pb.plot(Xorig,Yorig,'kx',mew=1.5) pb.xlim(xmin,xmax) @@ -288,7 +292,7 @@ class GP(model): resolution = 50 or resolution xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution] Xtest = np.vstack((xx.flatten(),yy.flatten())).T - zz,vv,phi = self.predict(Xtest,slices=which_functions) + zz,vv,phi = self.predict(Xtest,slices=which_functions,full_cov=full_cov) zz = zz.reshape(resolution,resolution) 
pb.contour(xx,yy,zz,vmin=zz.min(),vmax=zz.max(),cmap=pb.cm.jet) pb.scatter(Xorig[:,0],Xorig[:,1],40,Yorig,linewidth=0,cmap=pb.cm.jet,vmin=zz.min(),vmax=zz.max()) diff --git a/GPy/models/sparse_GP.py b/GPy/models/sparse_GP.py index ea1ba100..8b1b6fb9 100644 --- a/GPy/models/sparse_GP.py +++ b/GPy/models/sparse_GP.py @@ -7,7 +7,7 @@ from ..util.linalg import mdot, jitchol, chol_inv, pdinv from ..util.plot import gpplot from .. import kern from GP import GP -from ..inference.EP import Full +from ..inference.EP import Full,DTC,FITC from ..inference.likelihoods import likelihood,probit,poisson,gaussian #Still TODO: @@ -36,6 +36,8 @@ class sparse_GP(GP): :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales) :type normalize_(X|Y): bool :parm likelihood: a GPy likelihood, defaults to gaussian + :param method_ep: sparse approximation used by Expectation Propagation algorithm, defaults to DTC + :type M: string (Full|DTC|FITC) :param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1 :param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.] :type powerep: list @@ -58,17 +60,22 @@ class sparse_GP(GP): self.X_uncertainty = X_uncertainty GP.__init__(self, X=X, Y=Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y,likelihood=likelihood,epsilon_ep=epsilon_ep,power_ep=power_ep) - self.trYYT = np.sum(np.square(self.Y)) if not self.EP else None #normalise X uncertainty also if self.has_uncertain_inputs: self.X_uncertainty /= np.square(self._Xstd) + if not self.EP: + self.trYYT = np.sum(np.square(self.Y)) + else: + self.method_ep = method_ep + + def _set_params(self, p): self.Z = p[:self.M*self.Q].reshape(self.M, self.Q) if not self.EP: - #self.beta = p[self.M*self.Q] - self.beta = np.repeat(p[self.M*self.Q],self.N)[:,None] + self.beta = p[self.M*self.Q] + #self.beta = np.repeat(p[self.M*self.Q],self.N)[:,None] self.kern._set_params(p[self.Z.size + 1:]) self.beta2 = self.beta**2 else: @@ -76,7 +83,7 @@ class sparse_GP(GP): if self.Y is None: self.Y = np.ones([self.N,1]) self._compute_kernel_matrices() - self._computations() + self._computations() #NOTE At this point computations of dL are not needed def _get_params(self): if not self.EP: @@ -123,24 +130,29 @@ class sparse_GP(GP): self.G = mdot(self.LBL_inv, self.psi1VVpsi1, self.LBL_inv.T) # Compute dL_dpsi - self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones([self.N,1]) + self.dL_dpsi0 = - 0.5 * self.D * self.beta.flatten() * np.ones(self.N) self.dL_dpsi1 = mdot(self.LLambdai.T,self.C,self.V.T) - self.dL_dpsi2 = - 0.5 * self.beta * (self.D*(self.LBL_inv - self.Kmmi) + self.G) + #self.dL_dpsi2 = - 0.5 * self.beta * (self.D*(self.LBL_inv - self.Kmmi) + self.G) + self.dL_dpsi2 = - 0.5 * (self.D*(self.LBL_inv - self.Kmmi) + self.G) # Compute dL_dKmm self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi) # dB - self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - 2.*self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi) + self.Kmmi) # dC - self.dL_dKmm += np.dot(np.dot(self.G,self.beta*self.psi2) - np.dot(self.LBL_inv, self.psi1VVpsi1), self.Kmmi) + 0.5*self.G # dE + #self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - 2.*self.beta*mdot(self.LBL_inv, self.psi2, self.Kmmi) + self.Kmmi) # dC + self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - 2.*mdot(self.LBL_inv, self.psi2_beta_scaled, self.Kmmi) + self.Kmmi) # dC + #self.dL_dKmm += np.dot(np.dot(self.G,self.beta*self.psi2) - np.dot(self.LBL_inv, self.psi1VVpsi1), self.Kmmi) + 
0.5*self.G # dE + self.dL_dKmm += np.dot(np.dot(self.G,self.psi2_beta_scaled) - np.dot(self.LBL_inv, self.psi1VVpsi1), self.Kmmi) + 0.5*self.G # dE def approximate_likelihood(self): assert not isinstance(self.likelihood, gaussian), "EP is only available for non-gaussian likelihoods" - if self.ep_proxy == 'DTC': + if self.method_ep == 'DTC': self.ep_approx = DTC(self.Kmm,self.likelihood,self.psi1,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta]) - elif self.ep_proxy == 'FITC': + elif self.method_ep == 'FITC': self.ep_approx = FITC(self.Kmm,self.likelihood,self.psi1,self.psi0,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta]) else: self.ep_approx = Full(self.X,self.likelihood,self.kernel,inducing=None,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta]) self.beta, self.Y, self.Z_ep = self.ep_approx.fit_EP() + print "Aqui toy" + self.trbetaYYT = np.sum(np.square(self.Y)*self.beta) self._computations() def log_likelihood(self): @@ -149,30 +161,11 @@ class sparse_GP(GP): """ if not self.EP: A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta)) + D = -0.5*self.beta*self.trYYT else: A = -0.5*self.D*(self.N*np.log(2.*np.pi) - np.sum(np.log(self.beta))) - B = -0.5*self.D*self.trace_K - C = -0.5*self.D * self.B_logdet - D = -0.5*self.beta*self.trYYT - E = +0.5*np.sum(self.psi1VVpsi1 * self.LBL_inv) - return A+B+C+D+E - - - - - def log_likelihood(self): - """ - Compute the (lower bound on the) log marginal likelihood - """ - beta_logdet = self.N*self.D*np.log(self.beta) if not self.EP else self.D*np.sum(np.log(self.beta)) - if self.hetero_noise: - A = foo - B = bar D = -0.5*self.trbetaYYT - else: - A = -0.5*self.N*self.D*(np.log(2.*np.pi)) - 0.5*beta_logdet - B = -0.5*self.beta*self.D*self.trace_K if not self.EP else -0.5*self.D*self.trace_K - D = -0.5*self.beta*self.trYYT + B = -0.5*self.D*self.trace_K C = -0.5*self.D * self.B_logdet E = +0.5*np.sum(self.psi1VVpsi1 * self.LBL_inv) return A+B+C+D+E @@ -223,21 +216,33 @@ class sparse_GP(GP): return dL_dZ def _log_likelihood_gradients(self): - return np.hstack([self.dL_dZ().flatten(), self.dL_dbeta(), self.dL_dtheta()]) + if not self.EP: + return np.hstack([self.dL_dZ().flatten(), self.dL_dbeta(), self.dL_dtheta()]) + else: + return np.hstack([self.dL_dZ().flatten(), self.dL_dtheta()]) def _raw_predict(self, Xnew, slices, full_cov=False): """Internal helper function for making predictions, does not account for normalisation""" Kx = self.kern.K(self.Z, Xnew) mu = mdot(Kx.T, self.LBL_inv, self.psi1V) + phi = None if full_cov: - noise_term = np.eye(Xnew.shape[0])/self.beta if not self.EP else 0 Kxx = self.kern.K(Xnew) - var = Kxx - mdot(Kx.T, (self.Kmmi - self.LBL_inv), Kx) + noise_term + var = Kxx - mdot(Kx.T, (self.Kmmi - self.LBL_inv), Kx) + if not self.EP: + var += np.eye(Xnew.shape[0])/self.beta # TODO: This beta doesn't belong here in the EP case. + else: + raise NotImplementedError, "full_cov = True not implemented for EP" + #var = np.diag(var)[:,None] + #phi = self.likelihood.predictive_mean(mu,var) else: - noise_term = 1./self.beta if not self.EP else 0 Kxx = self.kern.Kdiag(Xnew) - var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.LBL_inv, Kx),0) + noise_term - return mu,var,None#TODO add phi for EP + var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.LBL_inv, Kx),0) + if not self.EP: + var += 1./self.beta # TODO: This beta doesn't belong here in the EP case. 
+ else: + phi = self.likelihood.predictive_mean(mu,var) + return mu,var,phi def plot(self, *args, **kwargs): """ Plot the fitted model: just call the GP_regression plot function and then add inducing inputs """ - GP_regression.plot(self,*args,**kwargs) GP.plot(self,*args,**kwargs) if self.Q==1: pb.plot(self.Z,self.Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12) From 29eb61d65efd224dc63b9141f7361d437119a3f3 Mon Sep 17 00:00:00 2001 From: Ricardo Andrade Date: Wed, 30 Jan 2013 12:14:32 +0000 Subject: [PATCH 18/44] EP plots samples now for the phi transformation. --- GPy/examples/poisson.py | 2 +- GPy/examples/sparse_ep_fix.py | 2 ++ GPy/inference/likelihoods.py | 24 +++++++++++++++++++----- GPy/models/GP.py | 2 +- GPy/models/sparse_GP.py | 1 - 5 files changed, 23 insertions(+), 8 deletions(-) diff --git a/GPy/examples/poisson.py b/GPy/examples/poisson.py index e15f310d..934637f1 100644 --- a/GPy/examples/poisson.py +++ b/GPy/examples/poisson.py @@ -43,6 +43,6 @@ print m.checkgrad() # Optimize and plot m.optimize() #m.em(plot_all=False) # EM algorithm -m.plot() +m.plot(samples=4) print(m) diff --git a/GPy/examples/sparse_ep_fix.py b/GPy/examples/sparse_ep_fix.py index f2c25898..ff90f2bb 100644 --- a/GPy/examples/sparse_ep_fix.py +++ b/GPy/examples/sparse_ep_fix.py @@ -14,6 +14,7 @@ pb.ion() N = 500 M = 5 +pb.close('all') ###################################### ## 1 dimensional example @@ -42,6 +43,7 @@ print m.checkgrad() #check gradient FIXME unit test please # optimize and plot #m.optimize('tnc', messages = 1) +m.EM() m.plot(samples=3,full_cov=False) # print(m) diff --git a/GPy/inference/likelihoods.py b/GPy/inference/likelihoods.py index b170dc3d..acf1aa2d 100644 --- a/GPy/inference/likelihoods.py +++ b/GPy/inference/likelihoods.py @@ -83,12 +83,20 @@ class probit(likelihood): var = var.flatten() return stats.norm.cdf(mu/np.sqrt(1+var)) + def predictive_var(self,mu,var): + p=self.predictive_mean(mu,var) + return p*(1-p) + def _log_likelihood_gradients(): raise NotImplementedError - def plot(self,X,phi,X_obs,Z=None): + def plot(self,X,mu,var,phi,X_obs,Z=None,samples=0): assert X_obs.shape[1] == 1, 'Number of dimensions must be 1' - gpplot(X,phi,np.zeros(X.shape[0])) + phi_var = self.predictive_var(mu,var) + gpplot(X,phi,phi_var) + if samples: + phi_samples = np.vstack([np.random.binomial(1,phi.flatten()) for s in range(samples)]) + pb.plot(X,phi_samples.T,'x', alpha = 0.4, c='#3465a4' ) pb.plot(X_obs,(self.Y+1)/2,'kx',mew=1.5) if Z is not None: pb.plot(Z,Z*0+.5,'r|',mew=1.5,markersize=12) @@ -164,16 +172,22 @@ class poisson(likelihood): sigma2_hat = m2 - mu_hat**2 # Second central moment return float(Z_hat), float(mu_hat), float(sigma2_hat) - def predictive_mean(self,mu,variance): + def predictive_mean(self,mu,var): return np.exp(mu*self.scale + self.location) + def predictive_var(self,mu,var): + return self.predictive_mean(mu,var) + def _log_likelihood_gradients(): raise NotImplementedError - def plot(self,X,phi,X_obs,Z=None): + def plot(self,X,mu,var,phi,X_obs,Z=None,samples=0): assert X_obs.shape[1] == 1, 'Number of dimensions must be 1' - gpplot(X,phi,np.zeros(X.shape[0])) + gpplot(X,phi,phi.flatten()) pb.plot(X_obs,self.Y,'kx',mew=1.5) + if samples: + phi_samples = np.vstack([np.random.poisson(phi.flatten(),phi.size) for s in range(samples)]) + pb.plot(X,phi_samples.T, alpha = 0.4, c='#3465a4', linewidth = 0.8) if Z is not None: pb.plot(Z,Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12) diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 482143d6..8222fd6a 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -285,7 +285,7 @@ class GP(model): if self.EP: pb.subplot(212) - self.likelihood.plot(Xnew,phi,self.X) + self.likelihood.plot(Xnew,m,v,phi,self.X,samples=samples) pb.xlim(xmin,xmax) elif self.X.shape[1]==2: diff --git 
a/GPy/models/sparse_GP.py b/GPy/models/sparse_GP.py index 8b1b6fb9..ba07254f 100644 --- a/GPy/models/sparse_GP.py +++ b/GPy/models/sparse_GP.py @@ -151,7 +151,6 @@ class sparse_GP(GP): else: self.ep_approx = Full(self.X,self.likelihood,self.kernel,inducing=None,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta]) self.beta, self.Y, self.Z_ep = self.ep_approx.fit_EP() - print "Aqui toy" self.trbetaYYT = np.sum(np.square(self.Y)*self.beta) self._computations() From d8eb155622e51a4a4fb62118af696b7d57c21aa8 Mon Sep 17 00:00:00 2001 From: Ricardo Andrade Date: Wed, 30 Jan 2013 16:00:03 +0000 Subject: [PATCH 19/44] Working for regression, still some bugs for EP. --- GPy/examples/sparse_ep_fix.py | 31 +++++++++++++------- GPy/models/sparse_GP.py | 55 ++++++++++++++++++----------------- 2 files changed, 50 insertions(+), 36 deletions(-) diff --git a/GPy/examples/sparse_ep_fix.py b/GPy/examples/sparse_ep_fix.py index ff90f2bb..defcb4eb 100644 --- a/GPy/examples/sparse_ep_fix.py +++ b/GPy/examples/sparse_ep_fix.py @@ -32,18 +32,29 @@ noise = GPy.kern.white(1) kernel = rbf + noise # create simple GP model -m = GPy.models.sparse_GP(X,Y=None, kernel=kernel, M=M,likelihood= likelihood) -#m = GPy.models.sparse_GP(X, Y, kernel, M=M) +#m = GPy.models.sparse_GP(X,Y=None, kernel=kernel, M=M,likelihood= likelihood) # contrain all parameters to be positive +#m.constrain_fixed('prec',100.) +m = GPy.models.sparse_GP(X, Y, kernel, M=M) m.ensure_default_constraints() -if not isinstance(m.likelihood,GPy.inference.likelihoods.gaussian): - m.approximate_likelihood() +#if not isinstance(m.likelihood,GPy.inference.likelihoods.gaussian): +# m.approximate_likelihood() print m.checkgrad() -#check gradient FIXME unit test please -# optimize and plot -#m.optimize('tnc', messages = 1) -m.EM() -m.plot(samples=3,full_cov=False) -# print(m) +m.optimize('tnc', messages = 1) +m.plot(samples=3) +print m +n = GPy.models.sparse_GP(X,Y=None, kernel=kernel, M=M,likelihood= likelihood) +n.ensure_default_constraints() +if not isinstance(n.likelihood,GPy.inference.likelihoods.gaussian): + n.approximate_likelihood() +print n.checkgrad() +pb.figure() +n.plot() + +""" +m = GPy.models.sparse_GP_regression(X, Y, kernel, M=M) +m.ensure_default_constraints() +print m.checkgrad() +""" diff --git a/GPy/models/sparse_GP.py b/GPy/models/sparse_GP.py index ba07254f..7f287174 100644 --- a/GPy/models/sparse_GP.py +++ b/GPy/models/sparse_GP.py @@ -10,6 +10,7 @@ from GP import GP from ..inference.EP import Full,DTC,FITC from ..inference.likelihoods import likelihood,probit,poisson,gaussian + #Still TODO: # make use of slices properly (kernel can now do this) # enable heteroscedatic noise (kernel will need to compute psi2 as a (NxMxM) array) @@ -35,12 +36,6 @@ class sparse_GP(GP): :type beta: float :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales) :type normalize_(X|Y): bool - :parm likelihood: a GPy likelihood, defaults to gaussian - :param method_ep: sparse approximation used by Expectation Propagation algorithm, defaults to DTC - :type M: string (Full|DTC|FITC) - :param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1 - :param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.] 
- :type powerep: list """ def __init__(self,X,Y=None,kernel=None,X_uncertainty=None,beta=100.,Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False,likelihood=None,method_ep='DTC',epsilon_ep=1e-3,power_ep=[1.,1.]): @@ -70,20 +65,21 @@ class sparse_GP(GP): else: self.method_ep = method_ep + #normalise X uncertainty also + if self.has_uncertain_inputs: + self.X_uncertainty /= np.square(self._Xstd) def _set_params(self, p): self.Z = p[:self.M*self.Q].reshape(self.M, self.Q) if not self.EP: self.beta = p[self.M*self.Q] - #self.beta = np.repeat(p[self.M*self.Q],self.N)[:,None] self.kern._set_params(p[self.Z.size + 1:]) - self.beta2 = self.beta**2 else: self.kern._set_params(p[self.Z.size:]) if self.Y is None: self.Y = np.ones([self.N,1]) self._compute_kernel_matrices() - self._computations() #NOTE At this point computations of dL are not needed + self._computations() def _get_params(self): if not self.EP: @@ -97,19 +93,22 @@ class sparse_GP(GP): else: return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + self.kern._get_param_names_transformed() + def _compute_kernel_matrices(self): # kernel computations, using BGPLVM notation #TODO: slices for psi statistics (easy enough) + self.Kmm = self.kern.K(self.Z) if self.has_uncertain_inputs: if not self.EP: - self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty)#.sum() + self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty)#.sum() NOTE psi0 is now a vector self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T - self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty)#FIXME add beta vector + self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty) + #self.psi2_beta_scaled = ? else: raise NotImplementedError, "uncertain_inputs not yet supported for EP" else: - self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices)#.sum() FIXME + self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices)#.sum() self.psi1 = self.kern.K(self.Z,self.X) self.psi2 = np.dot(self.psi1,self.psi1.T) self.psi2_beta_scaled = np.dot(self.psi1,self.beta*self.psi1.T) @@ -124,22 +123,29 @@ class sparse_GP(GP): self.B = np.eye(self.M) + self.A self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B) self.LLambdai = np.dot(self.LBi, self.Lmi) - self.trace_K = self.psi0.sum() - np.trace(self.A) self.LBL_inv = mdot(self.Lmi.T, self.Bi, self.Lmi) self.C = mdot(self.LLambdai, self.psi1V) self.G = mdot(self.LBL_inv, self.psi1VVpsi1, self.LBL_inv.T) + self.trace_K_beta_scaled = (self.psi0*self.beta).sum() - np.trace(self.A) + if not self.EP: + self.trace_K = self.psi0.sum() - np.trace(self.A)/self.beta # Compute dL_dpsi - self.dL_dpsi0 = - 0.5 * self.D * self.beta.flatten() * np.ones(self.N) self.dL_dpsi1 = mdot(self.LLambdai.T,self.C,self.V.T) - #self.dL_dpsi2 = - 0.5 * self.beta * (self.D*(self.LBL_inv - self.Kmmi) + self.G) - self.dL_dpsi2 = - 0.5 * (self.D*(self.LBL_inv - self.Kmmi) + self.G) + if not self.EP: + self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N) + if self.has_uncertain_inputs: + self.dL_dpsi2 = - 0.5 * self.beta * (self.D*(self.LBL_inv - self.Kmmi) + self.G) + else: + self.dL_dpsi2_ = - 0.5 * (self.D*(self.LBL_inv - self.Kmmi) + self.G) + else: + self.dL_dpsi0 = - 0.5 * self.D * self.beta.flatten() + if not self.has_uncertain_inputs: + self.dL_dpsi2_ = - 0.5 * (self.D*(self.LBL_inv - self.Kmmi) + self.G) # Compute dL_dKmm self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi) # dB - #self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - 2.*self.beta*mdot(self.LBL_inv, 
self.psi2, self.Kmmi) + self.Kmmi) # dC self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - 2.*mdot(self.LBL_inv, self.psi2_beta_scaled, self.Kmmi) + self.Kmmi) # dC - #self.dL_dKmm += np.dot(np.dot(self.G,self.beta*self.psi2) - np.dot(self.LBL_inv, self.psi1VVpsi1), self.Kmmi) + 0.5*self.G # dE self.dL_dKmm += np.dot(np.dot(self.G,self.psi2_beta_scaled) - np.dot(self.LBL_inv, self.psi1VVpsi1), self.Kmmi) + 0.5*self.G # dE def approximate_likelihood(self): @@ -164,7 +170,7 @@ class sparse_GP(GP): else: A = -0.5*self.D*(self.N*np.log(2.*np.pi) - np.sum(np.log(self.beta))) D = -0.5*self.trbetaYYT - B = -0.5*self.D*self.trace_K + B = -0.5*self.D*self.trace_K_beta_scaled C = -0.5*self.D * self.B_logdet E = +0.5*np.sum(self.psi1VVpsi1 * self.LBL_inv) return A+B+C+D+E @@ -194,7 +200,7 @@ class sparse_GP(GP): dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # for multiple_beta, dL_dpsi2 will be a different shape else: #re-cast computations in psi2 back to psi1: - dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1) + dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2_,self.beta.T*self.psi1) #dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1) dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X) dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X) @@ -210,7 +216,7 @@ class sparse_GP(GP): dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) else: #re-cast computations in psi2 back to psi1: - dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1) + dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2_,self.beta.T*self.psi1)#dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1) dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X) return dL_dZ @@ -229,16 +235,14 @@ class sparse_GP(GP): Kxx = self.kern.K(Xnew) var = Kxx - mdot(Kx.T, (self.Kmmi - self.LBL_inv), Kx) if not self.EP: - var += np.eye(Xnew.shape[0])/self.beta # TODO: This beta doesn't belong here in the EP case. + var += np.eye(Xnew.shape[0])/self.beta else: raise NotImplementedError, "full_cov = True not implemented for EP" - #var = np.diag(var)[:,None] - #phi = self.likelihood.predictive_mean(mu,var) else: Kxx = self.kern.Kdiag(Xnew) var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.LBL_inv, Kx),0) if not self.EP: - var += 1./self.beta # TODO: This beta doesn't belong here in the EP case. + var += 1./self.beta else: phi = self.likelihood.predictive_mean(mu,var) return mu,var,phi @@ -247,7 +251,6 @@ class sparse_GP(GP): """ Plot the fitted model: just call the GP_regression plot function and then add inducing inputs """ - #GP_regression.plot(self,*args,**kwargs) GP.plot(self,*args,**kwargs) if self.Q==1: pb.plot(self.Z,self.Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12) From c17d4758246df2bd53fc26d0612e880d48083ece Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Thu, 31 Jan 2013 11:02:27 +0000 Subject: [PATCH 20/44] Trying to fix docs, might break them --- doc/conf.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/doc/conf.py b/doc/conf.py index 474836a2..9fb5f02a 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -33,7 +33,10 @@ on_rtd = os.environ.get('READTHEDOCS', None) == 'True' if on_rtd: sys.path.append("../GPy") os.system("pwd") - os.system("sphinx-apidoc -f -o . ../GPy") + os.system("cd ..") + #os.system("sphinx-apidoc -f -o . ../GPy") + os.system("sphinx-apidoc -f -o ./docs ./GPy") + os.system("cd ./docs") # Add any paths that contain templates here, relative to this directory. 
templates_path = ['_templates'] From 85572836939c4cb79f810f5e6202e0f201e9b8e6 Mon Sep 17 00:00:00 2001 From: Alan Saul Date: Thu, 31 Jan 2013 11:08:25 +0000 Subject: [PATCH 21/44] Changed docs back for newGP --- doc/conf.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/doc/conf.py b/doc/conf.py index 9fb5f02a..474836a2 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -33,10 +33,7 @@ on_rtd = os.environ.get('READTHEDOCS', None) == 'True' if on_rtd: sys.path.append("../GPy") os.system("pwd") - os.system("cd ..") - #os.system("sphinx-apidoc -f -o . ../GPy") - os.system("sphinx-apidoc -f -o ./docs ./GPy") - os.system("cd ./docs") + os.system("sphinx-apidoc -f -o . ../GPy") # Add any paths that contain templates here, relative to this directory. templates_path = ['_templates'] From ea0802d9388e0a9476c79045ffffefca4b0f00b3 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Thu, 31 Jan 2013 12:00:57 +0000 Subject: [PATCH 22/44] much tidying and breakage in the GP class --- GPy/inference/likelihoods.py | 9 +- GPy/models/GP.py | 189 ++++++++++++----------------------- 2 files changed, 72 insertions(+), 126 deletions(-) diff --git a/GPy/inference/likelihoods.py b/GPy/inference/likelihoods.py index acf1aa2d..4c8090f6 100644 --- a/GPy/inference/likelihoods.py +++ b/GPy/inference/likelihoods.py @@ -196,6 +196,9 @@ class gaussian(likelihood): Gaussian likelihood Y is expected to take values in (-inf,inf) """ + self.variance = variance + self._data = Y + self. def moments_match(self,i,tau_i,v_i): """ Moments match of the marginal approximation in EP algorithm @@ -219,8 +222,8 @@ class gaussian(likelihood): if U is not None: pb.plot(U,np.ones(U.shape[0])*self.Y.min()*.8,'r|',mew=1.5,markersize=12) - def predictive_mean(self,mu,Sigma): - return mu - def _log_likelihood_gradients(): raise NotImplementedError + else: + var = var[:,None] * np.square(self._Ystd) + diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 8222fd6a..f5a0711d 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -8,23 +8,22 @@ from .. import kern from ..core import model from ..util.linalg import pdinv,mdot from ..util.plot import gpplot, Tango -from ..inference.EP import Full -from ..inference.likelihoods import likelihood,probit,poisson,gaussian +from ..inference.EP import Full # TODO: tidy +from ..inference import likelihoods class GP(model): """ Gaussian Process model for regression and EP :param X: input observations - :param Y: observed values :param kernel: a GPy kernel, defaults to rbf+white + :parm likelihood: a GPy likelihood :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales) :type normalize_X: False|True :param normalize_Y: whether to normalize the input data before computing (predictions will be in original scales) :type normalize_Y: False|True :param Xslices: how the X,Y data co-vary in the kernel (i.e. which "outputs" they correspond to). See (link:slicing) :rtype: model object - :parm likelihood: a GPy likelihood, defaults to gaussian :param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1 :param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.] :type powerep: list @@ -32,23 +31,19 @@ class GP(model): .. Note:: Multiple independent outputs are allowed using columns of Y """ - #TODO: make beta parameter explicit #TODO: when using EP, predict needs to return 3 values otherwise it just needs 2. At the moment predict returns 3 values in any case. 
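# A hedged usage sketch of the interface this patch moves towards: the GP
# class now takes an explicit likelihood object instead of Y. Module paths and
# the probit signature are as they stand at this commit (the series itself
# warns of breakage, so treat this as the shape of the API, not a working call):
#
#     import numpy as np
#     import GPy
#
#     X = np.random.randn(20, 1)
#     Y = np.where(np.random.randn(20, 1) > 0, 1, -1)   # probit wants +/-1 labels
#     kernel = GPy.kern.rbf(1) + GPy.kern.white(1)
#     likelihood = GPy.inference.likelihoods.probit(Y)
#     m = GPy.models.GP(X, kernel, likelihood)          # Y now lives in the likelihood
#     m.update_likelihood_approximation()               # run EP before optimising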
- def __init__(self,X,Y=None,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None,likelihood=None,epsilon_ep=1e-3,power_ep=[1.,1.]): + def __init__(self, X, kernel, likelihood, normalize_X=False, Xslices=None): # parse arguments self.Xslices = Xslices self.X = X - self.N, self.Q = self.X.shape assert len(self.X.shape)==2 - if kernel is None: - kernel = kern.rbf(X.shape[1]) + kern.bias(X.shape[1]) + kern.white(X.shape[1]) - else: - assert isinstance(kernel, kern.kern) + self.N, self.Q = self.X.shape + assert isinstance(kernel, kern.kern) self.kern = kernel - #here's some simple normalisation + #here's some simple normalisation for the inputs if normalize_X: self._Xmean = X.mean(0)[None,:] self._Xstd = X.std(0)[None,:] @@ -59,82 +54,48 @@ class GP(model): self._Xmean = np.zeros((1,self.X.shape[1])) self._Xstd = np.ones((1,self.X.shape[1])) - # Y - likelihood related variables, these might change whether using EP or not - if likelihood is None: - assert Y is not None, "Either Y or likelihood must be defined" - self.likelihood = gaussian(Y) - else: - self.likelihood = likelihood - assert len(self.likelihood.Y.shape)==2 + self.likelihood = likelihood + self.Y = self.likelihood.Y + self.YYT = self.likelihood.YYT # TODO: this is ugly. what about sufficient_stats? assert self.X.shape[0] == self.likelihood.Y.shape[0] self.N, self.D = self.likelihood.Y.shape - if isinstance(self.likelihood,gaussian): - self.EP = False - self.Y = Y - self.beta = 100.#FIXME beta should be an explicit parameter for this model - # Here's some simple normalisation - if normalize_Y: - self._Ymean = Y.mean(0)[None,:] - self._Ystd = Y.std(0)[None,:] - self.Y = (Y.copy()- self._Ymean) / self._Ystd - else: - self._Ymean = np.zeros((1,self.Y.shape[1])) - self._Ystd = np.ones((1,self.Y.shape[1])) - - if self.D > self.N: - # then it's more efficient to store YYT - self.YYT = np.dot(self.Y, self.Y.T) - else: - self.YYT = None - else: - if self.D > 1: - raise NotImplementedError, "EP is not implemented for D > 1" - # Y is defined after approximating the likelihood - self.EP = True - self.eta,self.delta = power_ep - self.epsilon_ep = epsilon_ep - self.beta = np.ones([self.N,self.D]) - self.Z_ep = 0 - self.Y = None - self._Ymean = np.zeros((1,self.D)) - self._Ystd = np.ones((1,self.D)) - model.__init__(self) def _set_params(self,p): - # TODO: add beta when not using EP - self.kern._set_params_transformed(p) + self.kern._set_params_transformed(p[:self.kern.Nparam]) + self.likelihood._set_params(p[self.kern.Nparam:]) + self.K = self.kern.K(self.X,slices1=self.Xslices) - if self.EP: - self.K += np.diag(1./self.beta.flatten()) - #else: - # self.beta = p[-1] + self.K += np.diag(self.likelihood_variance) + self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K) + #the gradient of the likelihood wrt the covariance matrix + if self.YYT is None: + self._alpha = np.dot(self.Ki,self.Y) + self._alpha2 = np.square(self._alpha) + self.dL_dK = 0.5*(np.dot(self._alpha,self._alpha.T)-self.D*self.Ki) + else: + tmp = mdot(self.Ki, self.YYT, self.Ki) + self._alpha2 = np.diag(tmp) + self.dL_dK = 0.5*(tmp - self.D*self.Ki) + def _get_params(self): - # TODO: add beta when not using EP - return self.kern._get_params_transformed() + return np.hstack((self.kern._get_params_transformed(), self.likelihood._get_params())) def _get_param_names(self): - # TODO: add beta when not using EP - return self.kern._get_param_names_transformed() + return self.kern._get_param_names_transformed() + self.likelihood._get_param_names() - def 
approximate_likelihood(self): + def update_likelihood_approximation(self): """ Approximates a non-gaussian likelihood using Expectation Propagation + + For a Gaussian (or direct: TODO) likelihood, no iteration is required: + this function does nothing """ - assert not isinstance(self.likelihood, gaussian), "EP is only available for non-gaussian likelihoods" - self.ep_approx = Full(self.K,self.likelihood,epsilon = self.epsilon_ep,power_ep=[self.eta,self.delta]) - self.beta, self.Y, self.Z_ep = self.ep_approx.fit_EP() - if self.D > self.N: - # then it's more efficient to store YYT - self.YYT = np.dot(self.Y, self.Y.T) - else: - self.YYT = None - # Kernel plus noise variance term - self.K = self.kern.K(self.X,slices1=self.Xslices) + np.diag(1./self.beta.flatten()) - self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K) + self.likelihood.fit(self.K) + self.Y, self.YYT, self.likelihood_variance, self.likelihood_Z = self.likelihood.sufficient_stats() # TODO: just store these in the likelihood? def _model_fit_term(self): """ @@ -147,29 +108,41 @@ class GP(model): def log_likelihood(self): """ - The log marginal likelihood for an EP model can be written as the log likelihood of - a regression model for a new variable Y* = v_tilde/tau_tilde, with a covariance + The log marginal likelihood of the GP. + + For an EP model, can be written as the log likelihood of a regression + model for a new variable Y* = v_tilde/tau_tilde, with a covariance matrix K* = K + diag(1./tau_tilde) plus a normalization term. """ - L = -0.5*selff.D*self.K_logdet + self.model_fit_term() - if self.EP: - L += self.normalisation_term() - return L + return -0.5*self.D*self.K_logdet + self.model_fit_term() + self.likelihood.Z - def log_likelihood(self): - complexity_term = -0.5*self.N*self.D*np.log(2.*np.pi) - 0.5*self.D*self.K_logdet - return complexity_term + self._model_fit_term() - - def dL_dK(self): - if self.YYT is None: - alpha = np.dot(self.Ki,self.Y) - dL_dK = 0.5*(np.dot(alpha,alpha.T)-self.D*self.Ki) - else: - dL_dK = 0.5*(mdot(self.Ki, self.YYT, self.Ki) - self.D*self.Ki) - return dL_dK def _log_likelihood_gradients(self): - return self.kern.dK_dtheta(partial=self.dL_dK(),X=self.X) + """ + The gradient of all parameters. 
+ + For the kernel parameters, use the chain rule via dL_dK + + For the likelihood parameters, pass in alpha = K^-1 y + """ + return np.hstack((self.kern.dK_dtheta(partial=self.dL_dK(),X=self.X), self.likelihood._gradients(self.alpha2))) + + def _raw_predict(self,_Xnew,slices, full_cov=False): + """ + Internal helper function for making predictions, does not account + for normalisation or likelihood + """ + Kx = self.kern.K(self.X,_Xnew, slices1=self.Xslices,slices2=slices) + mu = np.dot(np.dot(Kx.T,self.Ki),self.Y) + KiKx = np.dot(self.Ki,Kx) + if full_cov: + Kxx = self.kern.K(_Xnew, slices1=slices,slices2=slices) + var = Kxx - np.dot(KiKx.T,Kx) + else: + Kxx = self.kern.Kdiag(_Xnew, slices=slices) + var = Kxx - np.sum(np.multiply(KiKx,Kx),0) + return mu, var + def predict(self,Xnew, slices=None, full_cov=False): """ @@ -198,41 +171,11 @@ class GP(model): """ #normalise X values Xnew = (Xnew.copy() - self._Xmean) / self._Xstd - mu, var, phi = self._raw_predict(Xnew, slices, full_cov) + mu, var, phi = self._raw_predict(Xnew, slices, full_cov=full_cov) - #un-normalise - mu = mu*self._Ystd + self._Ymean - if full_cov: - if self.D==1: - var *= np.square(self._Ystd) - else: - var = var[:,:,None] * np.square(self._Ystd) - else: - if self.D==1: - var *= np.square(np.squeeze(self._Ystd)) - else: - var = var[:,None] * np.square(self._Ystd) + #now push through likelihood TODO - return mu,var,phi - - def _raw_predict(self,_Xnew,slices, full_cov=False): - """Internal helper function for making predictions, does not account for normalisation""" - Kx = self.kern.K(self.X,_Xnew, slices1=self.Xslices,slices2=slices) - mu = np.dot(np.dot(Kx.T,self.Ki),self.Y) - KiKx = np.dot(self.Ki,Kx) - if full_cov: - Kxx = self.kern.K(_Xnew, slices1=slices,slices2=slices) - var = Kxx - np.dot(KiKx.T,Kx) - if self.EP: - raise NotImplementedError, "full_cov = True not implemented for EP" - #var = np.diag(var)[:,None] - #phi = self.likelihood.predictive_mean(mu,var) - else: - Kxx = self.kern.Kdiag(_Xnew, slices=slices) - var = Kxx - np.sum(np.multiply(KiKx,Kx),0) - if self.EP: - phi = self.likelihood.predictive_mean(mu,var) - return mu, var, phi + return mean, _5pc, _95pc def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None,full_cov=False): """ From a6851cf63d8bdb2defcd9de025f74506141ab7a9 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Thu, 31 Jan 2013 12:28:52 +0000 Subject: [PATCH 23/44] massive restructuting to make the EP likelihoods work consistently --- GPy/{inference => likelihoods}/EP.py | 116 ++++++++---------- .../likelihood_functions.py} | 37 +++--- 2 files changed, 67 insertions(+), 86 deletions(-) rename GPy/{inference => likelihoods}/EP.py (76%) rename GPy/{inference/likelihoods.py => likelihoods/likelihood_functions.py} (91%) diff --git a/GPy/inference/EP.py b/GPy/likelihoods/EP.py similarity index 76% rename from GPy/inference/EP.py rename to GPy/likelihoods/EP.py index c3aad7c1..1519bf3b 100644 --- a/GPy/inference/EP.py +++ b/GPy/likelihoods/EP.py @@ -9,7 +9,7 @@ from ..util.plot import gpplot from .. 
import kern class EP: - def __init__(self,covariance,likelihood,Kmn=None,Knn_diag=None,epsilon=1e-3,power_ep=[1.,1.]): + def __init__(self,data,likelihood_function,epsilon=1e-3,power_ep=[1.,1.]): """ Expectation Propagation @@ -22,24 +22,10 @@ class EP: power_ep : Power-EP parameters (eta,delta) - 2x1 numpy array (floats) epsilon : Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float) """ - self.likelihood = likelihood - assert covariance.shape[0] == covariance.shape[1] - if Kmn is not None: - self.Kmm = covariance - self.Kmn = Kmn - self.M = self.Kmn.shape[0] - self.N = self.Kmn.shape[1] - assert self.M < self.N, 'The number of inducing inputs must be smaller than the number of observations' - else: - self.K = covariance - self.N = self.K.shape[0] - if Knn_diag is not None: - self.Knn_diag = Knn_diag - assert len(Knn_diag) == self.N, 'Knn_diagonal has size different from N' - + self.likelihood_function = likelihood_function self.epsilon = epsilon self.eta, self.delta = power_ep - self.jitter = 1e-12 + self.jitter = 1e-12 # TODO: is this needed? """ Initial values - Likelihood approximation parameters: @@ -54,10 +40,9 @@ class EP: sigma_sum = 1./self.tau_ + 1./self.tau_tilde mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2 Z_ep = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant - return self.tau_tilde[:,None], mu_tilde[:,None], Z_ep + self.Y, self.beta, self.Z = self.tau_tilde[:,None], mu_tilde[:,None], Z_ep -class Full(EP): - def fit_EP(self): + def fit_full(self,K): """ The expectation-propagation algorithm. For nomenclature see Rasmussen & Williams 2006. @@ -66,8 +51,8 @@ class Full(EP): #self.K = self.kernel.K(self.X,self.X) #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma) - self.mu=np.zeros(self.N) - self.Sigma=self.K.copy() + self.mu = np.zeros(self.N) + self.Sigma = K.copy() """ Initial values - Cavity distribution parameters: @@ -111,11 +96,11 @@ class Full(EP): self.mu = np.dot(self.Sigma,self.v_tilde) self.iterations += 1 #Sigma recomptutation with Cholesky decompositon - Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*(self.K) + Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*K B = np.eye(self.N) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K L = jitchol(B) V,info = linalg.flapack.dtrtrs(L,Sroot_tilde_K,lower=1) - self.Sigma = self.K - np.dot(V.T,V) + self.Sigma = K - np.dot(V.T,V) self.mu = np.dot(self.Sigma,self.v_tilde) epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N @@ -124,32 +109,33 @@ class Full(EP): return self._compute_GP_variables() -class DTC(EP): - def fit_EP(self): + def fit_DTC(self, Knn_diag, Kmn, Kmm): """ The expectation-propagation algorithm with sparse pseudo-input. For nomenclature see ... 2013. """ + #TODO: this doesn;t work with uncertain inputs! 
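# A standalone sketch (stand-in matrices; M inducing points, M < N assumed) of
# the low-rank prior this method builds: Qnn = Knm Kmm^{-1} Kmn, of which EP
# only ever needs the diagonal, so only M x M and M x N arrays are kept:
#
#     import numpy as np
#     M, N = 4, 50
#     Z = np.random.randn(M, 1)                          # stand-in inducing inputs
#     X = np.random.randn(N, 1)                          # stand-in data inputs
#     Kmm = np.exp(-0.5*(Z - Z.T)**2) + 1e-6*np.eye(M)   # toy RBF covariances
#     Kmn = np.exp(-0.5*(Z - X.T)**2)
#     KmmiKmn = np.linalg.solve(Kmm, Kmn)
#     Qnn_diag = np.sum(Kmn * KmmiKmn, -2)               # matches the code below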
+ """ Prior approximation parameters: q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0) Sigma0 = Qnn = Knm*Kmmi*Kmn """ - self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm) - self.KmnKnm = np.dot(self.Kmn, self.Kmn.T) - self.KmmiKmn = np.dot(self.Kmmi,self.Kmn) - self.Qnn_diag = np.sum(self.Kmn*self.KmmiKmn,-2) - self.LLT0 = self.Kmm.copy() + Kmmi, Lm, Lmi, Kmm_logdet = pdinv(Kmm) + KmnKnm = np.dot(Kmn, Kmn.T) + KmmiKmn = np.dot(Kmmi,self.Kmn) + Qnn_diag = np.sum(Kmn*KmmiKmn,-2) + LLT0 = Kmm.copy() """ Posterior approximation: q(f|y) = N(f| mu, Sigma) Sigma = Diag + P*R.T*R*P.T + K mu = w + P*gamma """ - self.mu = np.zeros(self.N) - self.LLT = self.Kmm.copy() - self.Sigma_diag = self.Qnn_diag.copy() + mu = np.zeros(self.N) + LLT = Kmm.copy() + Sigma_diag = Qnn_diag.copy() """ Initial values - Cavity distribution parameters: @@ -157,12 +143,12 @@ class DTC(EP): sigma_ = 1./tau_ mu_ = v_/tau_ """ - self.tau_ = np.empty(self.N,dtype=float) - self.v_ = np.empty(self.N,dtype=float) + tau_ = np.empty(self.N,dtype=float) + v_ = np.empty(self.N,dtype=float) #Initial values - Marginal moments z = np.empty(self.N,dtype=float) - self.Z_hat = np.empty(self.N,dtype=float) + Z_hat = np.empty(self.N,dtype=float) phi = np.empty(self.N,dtype=float) mu_hat = np.empty(self.N,dtype=float) sigma2_hat = np.empty(self.N,dtype=float) @@ -171,47 +157,45 @@ class DTC(EP): epsilon_np1 = 1 epsilon_np2 = 1 self.iterations = 0 - self.np1 = [self.tau_tilde.copy()] - self.np2 = [self.v_tilde.copy()] + np1 = [tau_tilde.copy()] + np2 = [v_tilde.copy()] while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: - update_order = np.arange(self.N) - random.shuffle(update_order) + update_order = np.random.permutation(self.N) for i in update_order: #Cavity distribution parameters - self.tau_[i] = 1./self.Sigma_diag[i] - self.eta*self.tau_tilde[i] - self.v_[i] = self.mu[i]/self.Sigma_diag[i] - self.eta*self.v_tilde[i] + tau_[i] = 1./Sigma_diag[i] - self.eta*tau_tilde[i] + v_[i] = mu[i]/Sigma_diag[i] - self.eta*v_tilde[i] #Marginal moments - self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood.moments_match(i,self.tau_[i],self.v_[i]) + Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood_function.moments_match(self.data[i],tau_[i],v_[i]) #Site parameters update - Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./self.Sigma_diag[i]) - Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - self.mu[i]/self.Sigma_diag[i]) - self.tau_tilde[i] = self.tau_tilde[i] + Delta_tau - self.v_tilde[i] = self.v_tilde[i] + Delta_v + Delta_tau = delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i]) + Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i]) + tau_tilde[i] = tau_tilde[i] + Delta_tau + v_tilde[i] = v_tilde[i] + Delta_v #Posterior distribution parameters update - self.LLT = self.LLT + np.outer(self.Kmn[:,i],self.Kmn[:,i])*Delta_tau - L = jitchol(self.LLT) - V,info = linalg.flapack.dtrtrs(L,self.Kmn,lower=1) - self.Sigma_diag = np.sum(V*V,-2) + LLT = LLT + np.outer(Kmn[:,i],Kmn[:,i])*Delta_tau + L = jitchol(LLT) + V,info = linalg.flapack.dtrtrs(L,Kmn,lower=1) + Sigma_diag = np.sum(V*V,-2) si = np.sum(V.T*V[:,i],-1) - self.mu = self.mu + (Delta_v-Delta_tau*self.mu[i])*si + mu = mu + (Delta_v-Delta_tau*mu[i])*si self.iterations += 1 #Sigma recomputation with Cholesky decompositon - self.LLT0 = self.LLT0 + np.dot(self.Kmn*self.tau_tilde[None,:],self.Kmn.T) - self.L = jitchol(self.LLT) - V,info = linalg.flapack.dtrtrs(L,self.Kmn,lower=1) + LLT0 = LLT0 + 
np.dot(Kmn*tau_tilde[None,:],Kmn.T)
+        L = jitchol(LLT)
+        V,info = linalg.flapack.dtrtrs(L,Kmn,lower=1)
         V2,info = linalg.flapack.dtrtrs(L.T,V,lower=0)
-        self.Sigma_diag = np.sum(V*V,-2)
-        Knmv_tilde = np.dot(self.Kmn,self.v_tilde)
-        self.mu = np.dot(V2.T,Knmv_tilde)
-        epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N
-        epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
-        self.np1.append(self.tau_tilde.copy())
-        self.np2.append(self.v_tilde.copy())
+        Sigma_diag = np.sum(V*V,-2)
+        Knmv_tilde = np.dot(Kmn,v_tilde)
+        mu = np.dot(V2.T,Knmv_tilde)
+        epsilon_np1 = sum((tau_tilde-np1[-1])**2)/self.N
+        epsilon_np2 = sum((v_tilde-np2[-1])**2)/self.N
+        np1.append(tau_tilde.copy())
+        np2.append(v_tilde.copy())

-        return self._compute_GP_variables()
+        self._compute_GP_variables()

-class FITC(EP):
-    def fit_EP(self):
+    def fit_FITC(self, Knn_diag, Kmn):
         """
         The expectation-propagation algorithm with sparse pseudo-input.
         For nomenclature see Naish-Guzman and Holden, 2008.
diff --git a/GPy/inference/likelihoods.py b/GPy/likelihoods/likelihood_functions.py
similarity index 91%
rename from GPy/inference/likelihoods.py
rename to GPy/likelihoods/likelihood_functions.py
index 4c8090f6..1387c53d 100644
--- a/GPy/inference/likelihoods.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2012, 2013 Ricardo Andrade
 # Licensed under the BSD 3-clause license (see LICENSE.txt)

@@ -15,9 +15,7 @@ class likelihood:
     :param Y: observed output (Nx1 numpy.ndarray)
     ..Note:: Y values allowed depend on the likelihood used
     """
-    def __init__(self,Y,location=0,scale=1):
-        self.Y = Y
-        self.N = self.Y.shape[0]
+    def __init__(self,location=0,scale=1):
         self.location = location
         self.scale = scale

@@ -59,11 +57,10 @@ class probit(likelihood):
     L(x) = \\Phi (Y_i*f_i)
     $$
     """
-    def __init__(self,Y,location=0,scale=1):
-        assert np.sum(np.abs(Y)-1) == 0, "Output values must be either -1 or 1"
+    def __init__(self,location=0,scale=1):
         likelihood.__init__(self,Y,location,scale)

-    def moments_match(self,i,tau_i,v_i):
+    def moments_match(self,data_i,tau_i,v_i):
         """
         Moments match of the marginal approximation in EP algorithm

@@ -71,10 +68,11 @@ class probit(likelihood):
         :param tau_i: precision of the cavity distribution (float)
         :param v_i: mean/variance of the cavity distribution (float)
         """
-        z = self.Y[i]*v_i/np.sqrt(tau_i**2 + tau_i)
+        # TODO: some version of assert np.sum(np.abs(Y)-1) == 0, "Output values must be either -1 or 1"
+        z = data_i*v_i/np.sqrt(tau_i**2 + tau_i)
         Z_hat = stats.norm.cdf(z)
         phi = stats.norm.pdf(z)
-        mu_hat = v_i/tau_i + self.Y[i]*phi/(Z_hat*np.sqrt(tau_i**2 + tau_i))
+        mu_hat = v_i/tau_i + data_i*phi/(Z_hat*np.sqrt(tau_i**2 + tau_i))
         sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat)
         return Z_hat, mu_hat, sigma2_hat

@@ -83,14 +81,16 @@ class probit(likelihood):
         var = var.flatten()
         return stats.norm.cdf(mu/np.sqrt(1+var))

-    def predictive_var(self,mu,var):
-        p=self.predictive_mean(mu,var)
-        return p*(1-p)
+    def predictive_quantiles(self,mu,var):
+        #p=self.predictive_mean(mu,var)
+        #return p*(1-p)
+        raise NotImplementedError #TODO

     def _log_likelihood_gradients():
-        raise NotImplementedError
+        return np.zeros(0) # there are no parameters of which to compute the gradients

     def plot(self,X,mu,var,phi,X_obs,Z=None,samples=0):
+        #TODO: remove me
         assert X_obs.shape[1] == 1, 'Number of dimensions must be 1'
         phi_var = self.predictive_var(mu,var)
         gpplot(X,phi,phi_var)
@@ -192,13 +192,10 @@ class poisson(likelihood):
         pb.plot(Z,Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12)

 class gaussian(likelihood):
-    """
-    Gaussian likelihood
-    Y is expected to take values in (-inf,inf)
-    """
-    self.variance = variance
-    self._data = Y
-    self.
+    """
+    Gaussian likelihood
+    Y is expected to take values in (-inf,inf)
+    """
     def moments_match(self,i,tau_i,v_i):
         """
         Moments match of the marginal approximation in EP algorithm
From bdc89170d423591f78893da199b7c33fc255e55f Mon Sep 17 00:00:00 2001
From: James Hensman
Date: Thu, 31 Jan 2013 13:34:30 +0000
Subject: [PATCH 24/44] added a Gaussian likelihood class

---
 GPy/likelihoods/Gaussian.py | 16 ++++++++++++++++
 GPy/likelihoods/__init__.py |  3 +++
 2 files changed, 19 insertions(+)
 create mode 100644 GPy/likelihoods/Gaussian.py
 create mode 100644 GPy/likelihoods/__init__.py

diff --git a/GPy/likelihoods/Gaussian.py b/GPy/likelihoods/Gaussian.py
new file mode 100644
index 00000000..2397ce38
--- /dev/null
+++ b/GPy/likelihoods/Gaussian.py
@@ -0,0 +1,16 @@
+import numpy as np
+
+class Gaussian:
+    def __init__(self,data,variance=1.,normalise=False):
+        self.data = data
+        if normalise:
+            foo
+        self._variance = variance
+    def _get_params(self):
+        return np.asarray(self.variance)
+    def _set_params(self,x):
+        self._variance = x
+    def fit(self):
+        pass
+    def _gradients(self,foo):
+        return bar(foo)
diff --git a/GPy/likelihoods/__init__.py b/GPy/likelihoods/__init__.py
new file mode 100644
index 00000000..d1369c43
--- /dev/null
+++ b/GPy/likelihoods/__init__.py
@@ -0,0 +1,3 @@
+from EP import EP
+from Gaussian import Gaussian
+# TODO: from Laplace import Laplace
From 9feae765dc2253edaa37b25e3417a364e5b9acdc Mon Sep 17 00:00:00 2001
From: Ricardo Andrade
Date: Thu, 31 Jan 2013 14:43:32 +0000
Subject: [PATCH 25/44] predictive_mean changed to predictive_values

---
 GPy/likelihoods/likelihood_functions.py | 84 +++++++------------
 GPy/models/GP.py                        |  7 +--
 2 files changed, 25 insertions(+), 66 deletions(-)

diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 1387c53d..49547b88 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -19,35 +19,6 @@ class likelihood:
         self.location = location
         self.scale = scale

-    def plot2D(self,X,X_new,F_new,U=None):
-        """
-        Predictive distribution of the fitted GP model for 2-dimensional inputs
-
-        :param X_new: The points at which to make a prediction
-        :param Mean_new: mean values at X_new
-        :param Var_new: variance values at X_new
-        :param X_u: input points used to train the model
-        :param Mean_u: mean values at X_u
-        :param Var_new: variance values at X_u
-        """
-        N,D = X_new.shape
-        assert D == 2, 'Number of dimensions must be 2'
-        n = np.sqrt(N)
-        x1min = X_new[:,0].min()
-        x1max = X_new[:,0].max()
-        x2min = X_new[:,1].min()
-        x2max = X_new[:,1].max()
-        pb.imshow(F_new.reshape(n,n),extent=(x1min,x1max,x2max,x2min),vmin=0,vmax=1)
-        pb.colorbar()
-        C1 = np.arange(self.N)[self.Y.flatten()==1]
-        C2 = np.arange(self.N)[self.Y.flatten()==-1]
-        [pb.plot(X[i,0],X[i,1],'ro') for i in C1]
-        [pb.plot(X[i,0],X[i,1],'bo') for i in C2]
-        pb.xlim(x1min,x1max)
-        pb.ylim(x2min,x2max)
-        if U is not None:
-            [pb.plot(a,b,'wo') for a,b in U]
-
 class probit(likelihood):
     """
     Probit likelihood
@@ -76,32 +47,23 @@ class probit(likelihood):
         sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat)
         return Z_hat, mu_hat, sigma2_hat

-    def predictive_mean(self,mu,var):
+    def predictive_values(self,mu,var,all=False):
+        """
+        Compute mean, variance, and confidence interval (percentiles 5 and 95) of the prediction
+        """
         mu = mu.flatten()
         var = var.flatten()
-        return stats.norm.cdf(mu/np.sqrt(1+var))
-
-    def predictive_quantiles(self,mu,var):
-        #p=self.predictive_mean(mu,var)
-        #return p*(1-p)
-        raise NotImplementedError #TODO
+        mean = stats.norm.cdf(mu/np.sqrt(1+var))
+        if all:
+            p_05 = np.zeros([mu.size])
+            p_95 = np.ones([mu.size])
+            return mean, mean*(1-mean),p_05,p_95
+        else:
+            return mean

     def _log_likelihood_gradients():
         return np.zeros(0) # there are no parameters of which to compute the gradients

-    def plot(self,X,mu,var,phi,X_obs,Z=None,samples=0):
-        #TODO: remove me
-        assert X_obs.shape[1] == 1, 'Number of dimensions must be 1'
-        phi_var = self.predictive_var(mu,var)
-        gpplot(X,phi,phi_var)
-        if samples:
-            phi_samples = np.vstack([np.random.binomial(1,phi.flatten()) for s in range(samples)])
-            pb.plot(X,phi_samples.T,'x', alpha = 0.4, c='#3465a4' )
-        pb.plot(X_obs,(self.Y+1)/2,'kx',mew=1.5)
-        if Z is not None:
-            pb.plot(Z,Z*0+.5,'r|',mew=1.5,markersize=12)
-        pb.ylim(-0.2,1.2)
-
 class poisson(likelihood):
     """
     Poisson likelihood
@@ -172,11 +134,18 @@ class poisson(likelihood):
         sigma2_hat = m2 - mu_hat**2 # Second central moment
         return float(Z_hat), float(mu_hat), float(sigma2_hat)

-    def predictive_mean(self,mu,var):
-        return np.exp(mu*self.scale + self.location)
-
-    def predictive_var(self,mu,var):
-        return predictive_mean(mu,var)
+    def predictive_values(self,mu,var,all=False):
+        """
+        Compute mean, variance, and confidence interval (percentiles 5 and 95) of the prediction
+        """
+        mean = np.exp(mu*self.scale + self.location)
+        if all:
+            tmp = stats.poisson.ppf(np.array([.05,.95]),mu)
+            p_05 = tmp[:,0]
+            p_95 = tmp[:,1]
+            return mean,mean,p_05,p_95
+        else:
+            return mean

     def _log_likelihood_gradients():
         raise NotImplementedError
@@ -212,13 +181,6 @@ class gaussian(likelihood):
             Z_hat = 1./np.sqrt(2*np.pi) * 1./np.sqrt(sigma**2+s**2) * np.exp(-.5*(mu-self.Y[i])**2/(sigma**2 + s**2))
         return Z_hat, mu_hat, sigma2_hat

-    def plot1Db(self,X,X_new,F_new,U=None):
-        assert X.shape[1] == 1, 'Number of dimensions must be 1'
-        gpplot(X_new,F_new,np.zeros(X_new.shape[0]))
-        pb.plot(X,self.Y,'kx',mew=1.5)
-        if U is not None:
-            pb.plot(U,np.ones(U.shape[0])*self.Y.min()*.8,'r|',mew=1.5,markersize=12)
-
     def _log_likelihood_gradients():
         raise NotImplementedError
     else:
         var = var[:,None] * np.square(self._Ystd)
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index f5a0711d..dfd22d9c 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -215,11 +215,10 @@ class GP(model):

         if self.X.shape[1]==1:
             Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None]
-            m,v,phi = self.predict(Xnew,slices=which_functions,full_cov=full_cov)
+            m,v = self.predict(Xnew,slices=which_functions,full_cov=full_cov)
             if self.EP:
                 pb.subplot(211)
             gpplot(Xnew,m,v)
-
             if samples: #NOTE why don't we put samples as a parameter of gpplot
                 s = np.random.multivariate_normal(m.flatten(),np.diag(v.flatten()),samples)
                 pb.plot(Xnew.flatten(),s.T, alpha = 0.4, c='#3465a4', linewidth = 0.8)
@@ -227,9 +226,7 @@
             pb.xlim(xmin,xmax)

             if self.EP:
-                pb.subplot(212)
-                self.likelihood.plot(Xnew,m,v,phi,self.X,samples=samples)
-                pb.xlim(xmin,xmax)
+                phi_m, phi_v, phi_l, phi_u = self.likelihood.predictive_values(m,v)

         elif self.X.shape[1]==2:
             resolution = 50 or resolution
From d077d28fd1667645f2776d96e2b7914964263821 Mon Sep 17 00:00:00 2001
From: James Hensman
Date: Thu, 31 Jan 2013 15:02:34 +0000
Subject: [PATCH 26/44] very basic functionality is now working

---
 GPy/__init__.py             |  2 +-
 GPy/likelihoods/EP.py       | 10 +-
 GPy/likelihoods/Gaussian.py | 37 ++++-
GPy/likelihoods/__init__.py | 1 + GPy/likelihoods/likelihood_functions.py | 11 +- GPy/models/GP.py | 70 ++++---- GPy/models/GP_regression.py | 209 +----------------------- GPy/models/__init__.py | 14 +- GPy/models/sparse_GP.py | 2 - GPy/util/plot.py | 16 +- 10 files changed, 88 insertions(+), 284 deletions(-) diff --git a/GPy/__init__.py b/GPy/__init__.py index 381d6232..c0772c27 100644 --- a/GPy/__init__.py +++ b/GPy/__init__.py @@ -7,5 +7,5 @@ import models import inference import util import examples -#import examples TODO: discuss! from core import priors +import likelihoods diff --git a/GPy/likelihoods/EP.py b/GPy/likelihoods/EP.py index 1519bf3b..3e975436 100644 --- a/GPy/likelihoods/EP.py +++ b/GPy/likelihoods/EP.py @@ -1,12 +1,9 @@ import numpy as np import random -import pylab as pb #TODO erase me from scipy import stats, linalg -from .likelihoods import likelihood from ..core import model from ..util.linalg import pdinv,mdot,jitchol from ..util.plot import gpplot -from .. import kern class EP: def __init__(self,data,likelihood_function,epsilon=1e-3,power_ep=[1.,1.]): @@ -15,12 +12,8 @@ class EP: Arguments --------- - X : input observations - likelihood : Output's likelihood (likelihood class) - kernel : a GPy kernel (kern class) - inducing : Either an array specifying the inducing points location or a sacalar defining their number. None value for using a non-sparse model is used. - power_ep : Power-EP parameters (eta,delta) - 2x1 numpy array (floats) epsilon : Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float) + likelihood_function : a likelihood function (see likelihood_functions.py) """ self.likelihood_function = likelihood_function self.epsilon = epsilon @@ -48,7 +41,6 @@ class EP: For nomenclature see Rasmussen & Williams 2006. """ #Prior distribution parameters: p(f|X) = N(f|0,K) - #self.K = self.kernel.K(self.X,self.X) #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma) self.mu = np.zeros(self.N) diff --git a/GPy/likelihoods/Gaussian.py b/GPy/likelihoods/Gaussian.py index 2397ce38..fe954b78 100644 --- a/GPy/likelihoods/Gaussian.py +++ b/GPy/likelihoods/Gaussian.py @@ -1,16 +1,39 @@ import numpy as np class Gaussian: - def __init__(self,data,variance=1.,normalise=False): + def __init__(self,data,variance=1.,normalize=False): self.data = data - if normalise: - foo - self._variance = variance + self.N,D = data.shape + self.Z = 0. 
# a correction factor which accounts for the approximation made + + #normalisation + if normalize: + self._mean = data.mean(0)[None,:] + self._std = data.std(0)[None,:] + self.Y = (self.data - self._mean)/self._std + else: + self._mean = np.zeros((1,D)) + self._std = np.ones((1,D)) + self.Y = self.data + + self.YYT = np.dot(self.Y,self.Y.T) + self._set_params(np.asarray(variance)) + def _get_params(self): - return np.asarray(self.variance) + return np.asarray(self._variance) + + def _get_param_names(self): + return ["noise variance"] + def _set_params(self,x): self._variance = x + self.variance = np.eye(self.N)*self._variance + def fit(self): + """ + No approximations needed + """ pass - def _gradients(self,foo): - return bar(foo) + + def _gradients(self,partial): + return np.sum(np.diag(partial)) diff --git a/GPy/likelihoods/__init__.py b/GPy/likelihoods/__init__.py index d1369c43..83413255 100644 --- a/GPy/likelihoods/__init__.py +++ b/GPy/likelihoods/__init__.py @@ -1,3 +1,4 @@ from EP import EP from Gaussian import Gaussian # TODO: from Laplace import Laplace +import likelihood_functions as functions diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index 1387c53d..7e6a5ba1 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -192,10 +192,10 @@ class poisson(likelihood): pb.plot(Z,Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12) class gaussian(likelihood): - """ - Gaussian likelihood - Y is expected to take values in (-inf,inf) - """ + """ + Gaussian likelihood + Y is expected to take values in (-inf,inf) + """ def moments_match(self,i,tau_i,v_i): """ Moments match of the marginal approximation in EP algorithm @@ -221,6 +221,3 @@ class gaussian(likelihood): def _log_likelihood_gradients(): raise NotImplementedError - else: - var = var[:,None] * np.square(self._Ystd) - diff --git a/GPy/models/GP.py b/GPy/models/GP.py index f5a0711d..827b94b7 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -8,8 +8,6 @@ from .. import kern from ..core import model from ..util.linalg import pdinv,mdot from ..util.plot import gpplot, Tango -from ..inference.EP import Full # TODO: tidy -from ..inference import likelihoods class GP(model): """ @@ -55,8 +53,6 @@ class GP(model): self._Xstd = np.ones((1,self.X.shape[1])) self.likelihood = likelihood - self.Y = self.likelihood.Y - self.YYT = self.likelihood.YYT # TODO: this is ugly. what about sufficient_stats? 
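# A small self-contained sketch (pure numpy, shapes assumed) of what the
# Gaussian likelihood above now owns and the GP merely reads back: the
# normalised targets Y and the cached outer product YYT.
#
#     import numpy as np
#     data = np.array([[1.0], [3.0], [5.0]])          # stand-in observations
#     mean, std = data.mean(0)[None,:], data.std(0)[None,:]
#     Y = (data - mean)/std                           # likelihood.Y when normalize=True
#     YYT = np.dot(Y, Y.T)                            # likelihood.YYT
#     # Y.mean(0) -> [0.], Y.std(0) -> [1.]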
assert self.X.shape[0] == self.likelihood.Y.shape[0] self.N, self.D = self.likelihood.Y.shape @@ -67,18 +63,16 @@ class GP(model): self.likelihood._set_params(p[self.kern.Nparam:]) self.K = self.kern.K(self.X,slices1=self.Xslices) - self.K += np.diag(self.likelihood_variance) + self.K += self.likelihood.variance self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K) #the gradient of the likelihood wrt the covariance matrix - if self.YYT is None: - self._alpha = np.dot(self.Ki,self.Y) - self._alpha2 = np.square(self._alpha) - self.dL_dK = 0.5*(np.dot(self._alpha,self._alpha.T)-self.D*self.Ki) + if self.likelihood.YYT is None: + alpha = np.dot(self.Ki,self.likelihood.Y) + self.dL_dK = 0.5*(np.dot(alpha,alpha.T)-self.D*self.Ki) else: - tmp = mdot(self.Ki, self.YYT, self.Ki) - self._alpha2 = np.diag(tmp) + tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki) self.dL_dK = 0.5*(tmp - self.D*self.Ki) def _get_params(self): @@ -95,16 +89,15 @@ class GP(model): this function does nothing """ self.likelihood.fit(self.K) - self.Y, self.YYT, self.likelihood_variance, self.likelihood_Z = self.likelihood.sufficient_stats() # TODO: just store these in the likelihood? def _model_fit_term(self): """ Computes the model fit using YYT if it's available """ - if self.YYT is None: - return -0.5*np.sum(np.square(np.dot(self.Li,self.Y))) + if self.likelihood.YYT is None: + return -0.5*np.sum(np.square(np.dot(self.Li,self.likelihood.Y))) else: - return -0.5*np.sum(np.multiply(self.Ki, self.YYT)) + return -0.5*np.sum(np.multiply(self.Ki, self.likelihood.YYT)) def log_likelihood(self): """ @@ -114,7 +107,7 @@ class GP(model): model for a new variable Y* = v_tilde/tau_tilde, with a covariance matrix K* = K + diag(1./tau_tilde) plus a normalization term. """ - return -0.5*self.D*self.K_logdet + self.model_fit_term() + self.likelihood.Z + return -0.5*self.D*self.K_logdet + self._model_fit_term() + self.likelihood.Z def _log_likelihood_gradients(self): @@ -125,7 +118,7 @@ class GP(model): For the likelihood parameters, pass in alpha = K^-1 y """ - return np.hstack((self.kern.dK_dtheta(partial=self.dL_dK(),X=self.X), self.likelihood._gradients(self.alpha2))) + return np.hstack((self.kern.dK_dtheta(partial=self.dL_dK,X=self.X), self.likelihood._gradients(partial=self.dL_dK))) def _raw_predict(self,_Xnew,slices, full_cov=False): """ @@ -133,7 +126,7 @@ class GP(model): for normalisation or likelihood """ Kx = self.kern.K(self.X,_Xnew, slices1=self.Xslices,slices2=slices) - mu = np.dot(np.dot(Kx.T,self.Ki),self.Y) + mu = np.dot(np.dot(Kx.T,self.Ki),self.likelihood.Y) KiKx = np.dot(self.Ki,Kx) if full_cov: Kxx = self.kern.K(_Xnew, slices1=slices,slices2=slices) @@ -177,8 +170,10 @@ class GP(model): return mean, _5pc, _95pc - def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None,full_cov=False): + def raw_plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None): """ + Plot the GP's view of the world, where the data is normalised and the likelihood is Gaussian + :param samples: the number of a posteriori samples to plot :param which_data: which if the training data to plot (default all) :type which_data: 'all' or a slice object to slice self.X, self.Y @@ -194,19 +189,17 @@ class GP(model): Can plot only part of the data and part of the posterior functions using which_data and which_functions """ + if which_functions=='all': which_functions = [True]*self.kern.Nparts if which_data=='all': which_data = slice(None) X = self.X[which_data,:] - Y = 
self.Y[which_data,:] - - Xorig = X*self._Xstd + self._Xmean - Yorig = Y*self._Ystd + self._Ymean #NOTE For EP this is v_tilde/beta + Y = self.likelihood.Y[which_data,:] if plot_limits is None: - xmin,xmax = Xorig.min(0),Xorig.max(0) + xmin,xmax = X.min(0),X.max(0) xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin) elif len(plot_limits)==2: xmin, xmax = plot_limits @@ -215,27 +208,17 @@ class GP(model): if self.X.shape[1]==1: Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None] - m,v,phi = self.predict(Xnew,slices=which_functions,full_cov=full_cov) - if self.EP: - pb.subplot(211) - gpplot(Xnew,m,v) + m,v = self._raw_predict(Xnew,slices=which_functions,full_cov=False) + lower, upper = m.flatten() - 2.*np.sqrt(v) , m.flatten()+ 2.*np.sqrt(v) + gpplot(Xnew,m,lower,upper) - if samples: #NOTE why don't we put samples as a parameter of gpplot - s = np.random.multivariate_normal(m.flatten(),np.diag(v.flatten()),samples) - pb.plot(Xnew.flatten(),s.T, alpha = 0.4, c='#3465a4', linewidth = 0.8) - pb.plot(Xorig,Yorig,'kx',mew=1.5) + pb.plot(X,Y,'kx',mew=1.5) pb.xlim(xmin,xmax) - - if self.EP: - pb.subplot(212) - self.likelihood.plot(Xnew,m,v,phi,self.X,samples=samples) - pb.xlim(xmin,xmax) - elif self.X.shape[1]==2: - resolution = 50 or resolution + resolution = resolution or 50 xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution] Xtest = np.vstack((xx.flatten(),yy.flatten())).T - zz,vv,phi = self.predict(Xtest,slices=which_functions,full_cov=full_cov) + zz,vv = self._raw_predict(Xtest,slices=which_functions,full_cov=False) zz = zz.reshape(resolution,resolution) pb.contour(xx,yy,zz,vmin=zz.min(),vmax=zz.max(),cmap=pb.cm.jet) pb.scatter(Xorig[:,0],Xorig[:,1],40,Yorig,linewidth=0,cmap=pb.cm.jet,vmin=zz.min(),vmax=zz.max()) @@ -244,3 +227,10 @@ class GP(model): else: raise NotImplementedError, "Cannot plot GPs with more than two input dimensions" + + def plot(self): + """ + Plot the data's view of the world, with non-normalised values and GP predictions passed through the likelihood + """ + pass# TODO!!!!! + diff --git a/GPy/models/GP_regression.py b/GPy/models/GP_regression.py index 72a24307..916e5284 100644 --- a/GPy/models/GP_regression.py +++ b/GPy/models/GP_regression.py @@ -1,18 +1,18 @@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Copyright (c) 2012, James Hensman # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np -import pylab as pb +from GP import GP +from .. import likelihoods from .. 
import kern -from ..core import model -from ..util.linalg import pdinv,mdot -from ..util.plot import gpplot, Tango -class GP_regression(model): +class GP_regression(GP): """ Gaussian Process model for regression + This is a thin wrapper around the GP class, with a set of sensible defalts + :param X: input observations :param Y: observed values :param kernel: a GPy kernel, defaults to rbf+white @@ -29,199 +29,8 @@ class GP_regression(model): def __init__(self,X,Y,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None): if kernel is None: - kernel = kern.rbf(X.shape[1]) + kern.bias(X.shape[1]) + kern.white(X.shape[1]) + kernel = kern.rbf(X.shape[1]) - # parse arguments - self.Xslices = Xslices - assert isinstance(kernel, kern.kern) - self.kern = kernel - self.X = X - self.Y = Y - assert len(self.X.shape)==2 - assert len(self.Y.shape)==2 - assert self.X.shape[0] == self.Y.shape[0] - self.N, self.D = self.Y.shape - self.N, self.Q = self.X.shape + likelihood = likelihoods.Gaussian(Y,normalize=normalize_Y) - #here's some simple normalisation - if normalize_X: - self._Xmean = X.mean(0)[None,:] - self._Xstd = X.std(0)[None,:] - self.X = (X.copy() - self._Xmean) / self._Xstd - if hasattr(self,'Z'): - self.Z = (self.Z - self._Xmean) / self._Xstd - else: - self._Xmean = np.zeros((1,self.X.shape[1])) - self._Xstd = np.ones((1,self.X.shape[1])) - - if normalize_Y: - self._Ymean = Y.mean(0)[None,:] - self._Ystd = Y.std(0)[None,:] - self.Y = (Y.copy()- self._Ymean) / self._Ystd - else: - self._Ymean = np.zeros((1,self.Y.shape[1])) - self._Ystd = np.ones((1,self.Y.shape[1])) - - if self.D > self.N: - # then it's more efficient to store YYT - self.YYT = np.dot(self.Y, self.Y.T) - else: - self.YYT = None - - model.__init__(self) - - def _set_params(self,p): - self.kern._set_params_transformed(p) - self.K = self.kern.K(self.X,slices1=self.Xslices) - self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K) - - def _get_params(self): - return self.kern._get_params_transformed() - - def _get_param_names(self): - return self.kern._get_param_names_transformed() - - def _model_fit_term(self): - """ - Computes the model fit using YYT if it's available - """ - if self.YYT is None: - return -0.5*np.sum(np.square(np.dot(self.Li,self.Y))) - else: - return -0.5*np.sum(np.multiply(self.Ki, self.YYT)) - - def log_likelihood(self): - complexity_term = -0.5*self.N*self.D*np.log(2.*np.pi) - 0.5*self.D*self.K_logdet - return complexity_term + self._model_fit_term() - - def dL_dK(self): - if self.YYT is None: - alpha = np.dot(self.Ki,self.Y) - dL_dK = 0.5*(np.dot(alpha,alpha.T)-self.D*self.Ki) - else: - dL_dK = 0.5*(mdot(self.Ki, self.YYT, self.Ki) - self.D*self.Ki) - - return dL_dK - - def _log_likelihood_gradients(self): - return self.kern.dK_dtheta(partial=self.dL_dK(),X=self.X) - - def predict(self,Xnew, slices=None, full_cov=False): - """ - - Predict the function(s) at the new point(s) Xnew. - - Arguments - --------- - :param Xnew: The points at which to make a prediction - :type Xnew: np.ndarray, Nnew x self.Q - :param slices: specifies which outputs kernel(s) the Xnew correspond to (see below) - :type slices: (None, list of slice objects, list of ints) - :param full_cov: whether to return the folll covariance matrix, or just the diagonal - :type full_cov: bool - :rtype: posterior mean, a Numpy array, Nnew x self.D - :rtype: posterior variance, a Numpy array, Nnew x Nnew x (self.D) - - .. Note:: "slices" specifies how the the points X_new co-vary wich the training points. 
- - - If None, the new points covary throigh every kernel part (default) - - If a list of slices, the i^th slice specifies which data are affected by the i^th kernel part - - If a list of booleans, specifying which kernel parts are active - - If full_cov and self.D > 1, the return shape of var is Nnew x Nnew x self.D. If self.D == 1, the return shape is Nnew x Nnew. - This is to allow for different normalisations of the output dimensions. - - - """ - - #normalise X values - Xnew = (Xnew.copy() - self._Xmean) / self._Xstd - mu, var = self._raw_predict(Xnew, slices, full_cov) - - #un-normalise - mu = mu*self._Ystd + self._Ymean - if full_cov: - if self.D==1: - var *= np.square(self._Ystd) - else: - var = var[:,:,None] * np.square(self._Ystd) - else: - if self.D==1: - var *= np.square(np.squeeze(self._Ystd)) - else: - var = var[:,None] * np.square(self._Ystd) - - return mu,var - - def _raw_predict(self,_Xnew,slices, full_cov=False): - """Internal helper function for making predictions, does not account for normalisation""" - Kx = self.kern.K(self.X,_Xnew, slices1=self.Xslices,slices2=slices) - mu = np.dot(np.dot(Kx.T,self.Ki),self.Y) - KiKx = np.dot(self.Ki,Kx) - if full_cov: - Kxx = self.kern.K(_Xnew, slices1=slices,slices2=slices) - var = Kxx - np.dot(KiKx.T,Kx) - else: - Kxx = self.kern.Kdiag(_Xnew, slices=slices) - var = Kxx - np.sum(np.multiply(KiKx,Kx),0) - return mu, var - - def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None): - """ - :param samples: the number of a posteriori samples to plot - :param which_data: which if the training data to plot (default all) - :type which_data: 'all' or a slice object to slice self.X, self.Y - :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits - :param which_functions: which of the kernel functions to plot (additively) - :type which_functions: list of bools - :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D - - Plot the posterior of the GP. - - In one dimension, the function is plotted with a shaded region identifying two standard deviations. - - In two dimsensions, a contour-plot shows the mean predicted function - - In higher dimensions, we've no implemented this yet !TODO! 
- - Can plot only part of the data and part of the posterior functions using which_data and which_functions - """ - if which_functions=='all': - which_functions = [True]*self.kern.Nparts - if which_data=='all': - which_data = slice(None) - - X = self.X[which_data,:] - Y = self.Y[which_data,:] - - Xorig = X*self._Xstd + self._Xmean - Yorig = Y*self._Ystd + self._Ymean - if plot_limits is None: - xmin,xmax = Xorig.min(0),Xorig.max(0) - xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin) - elif len(plot_limits)==2: - xmin, xmax = plot_limits - else: - raise ValueError, "Bad limits for plotting" - - - if self.X.shape[1]==1: - Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None] - m,v = self.predict(Xnew,slices=which_functions) - gpplot(Xnew,m,v) - if samples: - s = np.random.multivariate_normal(m.flatten(),v,samples) - pb.plot(Xnew.flatten(),s.T, alpha = 0.4, c='#3465a4', linewidth = 0.8) - pb.plot(Xorig,Yorig,'kx',mew=1.5) - pb.xlim(xmin,xmax) - - elif self.X.shape[1]==2: - resolution = 50 or resolution - xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution] - Xtest = np.vstack((xx.flatten(),yy.flatten())).T - zz,vv = self.predict(Xtest,slices=which_functions) - zz = zz.reshape(resolution,resolution) - pb.contour(xx,yy,zz,vmin=zz.min(),vmax=zz.max(),cmap=pb.cm.jet) - pb.scatter(Xorig[:,0],Xorig[:,1],40,Yorig,linewidth=0,cmap=pb.cm.jet,vmin=zz.min(),vmax=zz.max()) - pb.xlim(xmin[0],xmax[0]) - pb.ylim(xmin[1],xmax[1]) - - else: - raise NotImplementedError, "Cannot plot GPs with more than two input dimensions" + GP.__init__(self, X, kernel, likelihood, normalize_X=normalize_X, Xslices=Xslices) diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py index d2de84aa..1175eb71 100644 --- a/GPy/models/__init__.py +++ b/GPy/models/__init__.py @@ -2,14 +2,14 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) -#from GP_regression import GP_regression #from sparse_GP_regression import sparse_GP_regression -# ^^ remove these? +# TODO ^^ remove these? from GPLVM import GPLVM from warped_GP import warpedGP -from generalized_FITC import generalized_FITC -from sparse_GPLVM import sparse_GPLVM -from uncollapsed_sparse_GP import uncollapsed_sparse_GP +# TODO: from generalized_FITC import generalized_FITC +#from sparse_GPLVM import sparse_GPLVM +#from uncollapsed_sparse_GP import uncollapsed_sparse_GP from GP import GP -from sparse_GP import sparse_GP -from BGPLVM import Bayesian_GPLVM +from GP_regression import GP_regression +#from sparse_GP import sparse_GP +#from BGPLVM import Bayesian_GPLVM diff --git a/GPy/models/sparse_GP.py b/GPy/models/sparse_GP.py index 7f287174..7b043209 100644 --- a/GPy/models/sparse_GP.py +++ b/GPy/models/sparse_GP.py @@ -7,8 +7,6 @@ from ..util.linalg import mdot, jitchol, chol_inv, pdinv from ..util.plot import gpplot from .. 
import kern from GP import GP -from ..inference.EP import Full,DTC,FITC -from ..inference.likelihoods import likelihood,probit,poisson,gaussian #Still TODO: diff --git a/GPy/util/plot.py b/GPy/util/plot.py index 8c06633e..3b4682e4 100644 --- a/GPy/util/plot.py +++ b/GPy/util/plot.py @@ -6,7 +6,7 @@ import Tango import pylab as pb import numpy as np -def gpplot(x,mu,var,edgecol=Tango.coloursHex['darkBlue'],fillcol=Tango.coloursHex['lightBlue'],axes=None,**kwargs): +def gpplot(x,mu,lower,upper,edgecol=Tango.coloursHex['darkBlue'],fillcol=Tango.coloursHex['lightBlue'],axes=None,**kwargs): if axes is None: axes = pb.gca() mu = mu.flatten() @@ -15,21 +15,15 @@ def gpplot(x,mu,var,edgecol=Tango.coloursHex['darkBlue'],fillcol=Tango.coloursHe #here's the mean axes.plot(x,mu,color=edgecol,linewidth=2) - #ensure variance is a vector - if len(var.shape)>1: - err = 2*np.sqrt(np.diag(var)) - else: - err = 2*np.sqrt(var) - - #here's the 2*std box + #here's the box kwargs['linewidth']=0.5 if not 'alpha' in kwargs.keys(): kwargs['alpha'] = 0.3 - axes.fill(np.hstack((x,x[::-1])),np.hstack((mu+err,mu[::-1]-err[::-1])),color=fillcol,**kwargs) + axes.fill(np.hstack((x,x[::-1])),np.hstack((upper,lower[::-1])),color=fillcol,**kwargs) #this is the edge: - axes.plot(x,mu+err,color=edgecol,linewidth=0.2) - axes.plot(x,mu-err,color=edgecol,linewidth=0.2) + axes.plot(x,upper,color=edgecol,linewidth=0.2) + axes.plot(x,lower,color=edgecol,linewidth=0.2) def removeRightTicks(ax=None): ax = ax or pb.gca() From 2b40ee6f7e952b11405d6e2434995c68c9ac71da Mon Sep 17 00:00:00 2001 From: Ricardo Andrade Date: Thu, 31 Jan 2013 15:30:57 +0000 Subject: [PATCH 27/44] predictive_values implemented in EP --- GPy/likelihoods/EP.py | 11 ++++++++++- GPy/likelihoods/Gaussian.py | 10 ++++++++++ GPy/likelihoods/likelihood_functions.py | 8 ++++---- GPy/models/GP.py | 3 ++- 4 files changed, 26 insertions(+), 6 deletions(-) diff --git a/GPy/likelihoods/EP.py b/GPy/likelihoods/EP.py index 3e975436..b557a62f 100644 --- a/GPy/likelihoods/EP.py +++ b/GPy/likelihoods/EP.py @@ -18,7 +18,6 @@ class EP: self.likelihood_function = likelihood_function self.epsilon = epsilon self.eta, self.delta = power_ep - self.jitter = 1e-12 # TODO: is this needed? """ Initial values - Likelihood approximation parameters: @@ -27,6 +26,16 @@ class EP: self.tau_tilde = np.zeros(self.N) self.v_tilde = np.zeros(self.N) + def predictive_values(self,mu,var): + return self.likelihood_function.predictive_values(mu,var) + + def _get_params(self): + return np.zeros(0) + def _get_param_names(self): + return [] + def _set_params(self,p): + pass # TODO: the EP likelihood might want to take some parameters... 
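# For reference, the (power-)EP site update that fit_full/fit_DTC implement,
# written out per site i (eta and delta are the power-EP parameters stored in
# __init__; with eta = delta = 1 this is standard EP):
#
#     cavity:  tau_[i] = 1/Sigma[i,i] - eta*tau_tilde[i]
#              v_[i]   = mu[i]/Sigma[i,i] - eta*v_tilde[i]
#     moments: Z_hat[i], mu_hat[i], sigma2_hat[i] = moments_match(data[i], tau_[i], v_[i])
#     site:    tau_tilde[i] += delta/eta * (1/sigma2_hat[i] - 1/Sigma[i,i])
#              v_tilde[i]   += delta/eta * (mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i])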
+
     def _compute_GP_variables(self):
         #Variables to be called from GP
         mu_tilde = self.v_tilde/self.tau_tilde #When calling EP, this variable is used instead of Y in the GP model
diff --git a/GPy/likelihoods/Gaussian.py b/GPy/likelihoods/Gaussian.py
index fe954b78..37132cf0 100644
--- a/GPy/likelihoods/Gaussian.py
+++ b/GPy/likelihoods/Gaussian.py
@@ -29,6 +29,16 @@ class Gaussian:
         self._variance = x
         self.variance = np.eye(self.N)*self._variance

+    def predictive_values(self,mu,var):
+        """
+        Un-normalise the prediction and add the likelihood variance, then return the 5%, 95% interval
+        """
+        mean = mu*self._std + self._mean
+        true_var = (var + self._variance)*self._std**2
+        _5pc = mean - 2.*np.sqrt(true_var)
+        _95pc = mean + 2.*np.sqrt(true_var)
+        return mean, _5pc, _95pc
+
     def fit(self):
         """
         No approximations needed
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index b94929d3..68fd276a 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -49,7 +49,7 @@ class probit(likelihood):

     def predictive_values(self,mu,var,all=False):
         """
-        Compute mean, variance, and confidence interval (percentiles 5 and 95) of the prediction
+        Compute mean, and confidence interval (percentiles 5 and 95) of the prediction
         """
         mu = mu.flatten()
         var = var.flatten()
@@ -57,7 +57,7 @@ class probit(likelihood):
         if all:
             p_05 = np.zeros([mu.size])
             p_95 = np.ones([mu.size])
-            return mean, mean*(1-mean),p_05,p_95
+            return mean, p_05, p_95
         else:
             return mean
@@ -136,14 +136,14 @@ class poisson(likelihood):

     def predictive_values(self,mu,var,all=False):
         """
-        Compute mean, variance, and confidence interval (percentiles 5 and 95) of the prediction
+        Compute mean, and confidence interval (percentiles 5 and 95) of the prediction
         """
         mean = np.exp(mu*self.scale + self.location)
         if all:
             tmp = stats.poisson.ppf(np.array([.05,.95]),mu)
             p_05 = tmp[:,0]
             p_95 = tmp[:,1]
-            return mean,mean,p_05,p_95
+            return mean,p_05,p_95
         else:
             return mean
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index b663ad3e..d20aa290 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -164,9 +164,10 @@ class GP(model):
         """
         #normalise X values
         Xnew = (Xnew.copy() - self._Xmean) / self._Xstd
-        mu, var, phi = self._raw_predict(Xnew, slices, full_cov=full_cov)
+        mu, var = self._raw_predict(Xnew, slices, full_cov=full_cov)

         #now push through likelihood TODO
+        mean, _5pc, _95pc = self.likelihood.predictive_values(mu, var)

         return mean, _5pc, _95pc

From 3a558d8244cc3a46a088d61cee3a7a1c743a875c Mon Sep 17 00:00:00 2001
From: James Hensman
Date: Thu, 31 Jan 2013 15:30:58 +0000
Subject: [PATCH 28/44] merged conflict

---
 GPy/models/GP.py | 19 +------------------
 1 file changed, 1 insertion(+), 18 deletions(-)

diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index b663ad3e..5964570a 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -29,7 +29,6 @@ class GP(model):

     .. Note:: Multiple independent outputs are allowed using columns of Y

     """
-    #TODO: when using EP, predict needs to return 3 values otherwise it just needs 2. At the moment predict returns 3 values in any case.
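# The prediction path after these patches, as a sketch (method names from the
# surrounding diffs; m is a fitted GP with any likelihood):
#
#     Xnew = (Xnew - m._Xmean) / m._Xstd                    # normalise inputs
#     mu, var = m._raw_predict(Xnew, slices)                # latent posterior
#     mean, _5pc, _95pc = m.likelihood.predictive_values(mu, var)
#
# so predict() always returns three values, and the likelihood-specific warping
# (probit squashing, Poisson exponentiation, Gaussian un-normalising) happens
# inside predictive_values.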
     def __init__(self, X, kernel, likelihood, normalize_X=False, Xslices=None):
@@ -164,7 +163,7 @@ class GP(model):
         """
         #normalise X values
         Xnew = (Xnew.copy() - self._Xmean) / self._Xstd
-        mu, var, phi = self._raw_predict(Xnew, slices, full_cov=full_cov)
+        mu, var = self._raw_predict(Xnew, slices, full_cov=full_cov)

         #now push through likelihood TODO

@@ -208,25 +207,9 @@ class GP(model):

         if self.X.shape[1]==1:
             Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None]
-<<<<<<< HEAD
             m,v = self._raw_predict(Xnew,slices=which_functions,full_cov=False)
             lower, upper = m.flatten() - 2.*np.sqrt(v) , m.flatten()+ 2.*np.sqrt(v)
             gpplot(Xnew,m,lower,upper)
-=======
-            m,v = self.predict(Xnew,slices=which_functions,full_cov=full_cov)
-            if self.EP:
-                pb.subplot(211)
-            gpplot(Xnew,m,v)
-            if samples: #NOTE why don't we put samples as a parameter of gpplot
-                s = np.random.multivariate_normal(m.flatten(),np.diag(v.flatten()),samples)
-                pb.plot(Xnew.flatten(),s.T, alpha = 0.4, c='#3465a4', linewidth = 0.8)
-            pb.plot(Xorig,Yorig,'kx',mew=1.5)
-            pb.xlim(xmin,xmax)
-
-            if self.EP:
-                phi_m, phi_v, phi_l, phi_u = self.likelihood.predictive_values(m,v)
->>>>>>> 9feae765dc2253edaa37b25e3417a364e5b9acdc
-
             pb.plot(X,Y,'kx',mew=1.5)
             pb.xlim(xmin,xmax)
         elif self.X.shape[1]==2:

From 7dfbcebb87edc6d752cd39b675d98deb5f005f54 Mon Sep 17 00:00:00 2001
From: James Hensman
Date: Fri, 1 Feb 2013 09:47:30 +0000
Subject: [PATCH 29/44] some tidying in the likelihood classes

---
 GPy/likelihoods/EP.py                   |  15 +-
 GPy/likelihoods/Gaussian.py             |   5 +-
 GPy/likelihoods/likelihood_functions.py |  14 +-
 GPy/models/sparse_GP.py                 | 236 ++++++++--------------
 GPy/models/sparse_GP_old.py             | 258 ++++++++++++++++++++++++
 GPy/models/sparse_GP_regression.py      | 205 -------------------
 6 files changed, 364 insertions(+), 369 deletions(-)
 create mode 100644 GPy/models/sparse_GP_old.py
 delete mode 100644 GPy/models/sparse_GP_regression.py

diff --git a/GPy/likelihoods/EP.py b/GPy/likelihoods/EP.py
index 3e975436..c52fd8bf 100644
--- a/GPy/likelihoods/EP.py
+++ b/GPy/likelihoods/EP.py
@@ -18,12 +18,10 @@ class EP:
         self.likelihood_function = likelihood_function
         self.epsilon = epsilon
         self.eta, self.delta = power_ep
-        self.jitter = 1e-12 # TODO: is this needed?
+        self.is_heteroscedastic = True

-        """
-        Initial values - Likelihood approximation parameters:
-        p(y|f) = t(f|tau_tilde,v_tilde)
-        """
+        #Initial values - Likelihood approximation parameters:
+        #p(y|f) = t(f|tau_tilde,v_tilde)
         self.tau_tilde = np.zeros(self.N)
         self.v_tilde = np.zeros(self.N)

@@ -32,8 +30,11 @@
         mu_tilde = self.v_tilde/self.tau_tilde #When calling EP, this variable is used instead of Y in the GP model
         sigma_sum = 1./self.tau_ + 1./self.tau_tilde
         mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2
-        Z_ep = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant
-        self.Y, self.beta, self.Z = self.tau_tilde[:,None], mu_tilde[:,None], Z_ep
+        self.Z = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant, aka Z_ep
+
+        self.Y = mu_tilde[:,None]
+        self.precision = self.tau_tilde
+        self.covariance_matrix = np.diag(1./self.precision)

     def fit_full(self,K):
         """
diff --git a/GPy/likelihoods/Gaussian.py b/GPy/likelihoods/Gaussian.py
index fe954b78..5b461537 100644
--- a/GPy/likelihoods/Gaussian.py
+++ b/GPy/likelihoods/Gaussian.py
@@ -2,6 +2,7 @@ import numpy as np

 class Gaussian:
     def __init__(self,data,variance=1.,normalize=False):
+        self.is_heteroscedastic = False
         self.data = data
         self.N,D = data.shape
         self.Z = 0. # a correction factor which accounts for the approximation made
@@ -19,6 +20,7 @@ class Gaussian:
         self.YYT = np.dot(self.Y,self.Y.T)

         self._set_params(np.asarray(variance))
+
     def _get_params(self):
         return np.asarray(self._variance)

@@ -27,7 +29,8 @@ class Gaussian:

     def _set_params(self,x):
         self._variance = x
-        self.variance = np.eye(self.N)*self._variance
+        self.covariance_matrix = np.eye(self.N)*self._variance
+        self.precision = 1./self._variance

     def fit(self):
         """
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index b94929d3..756d9eb0 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -8,18 +8,18 @@ import scipy as sp
 import pylab as pb
 from ..util.plot import gpplot

-class likelihood:
+class likelihood_function:
     """
     Likelihood class for doing Expectation propagation

     :param Y: observed output (Nx1 numpy.darray)
-    ..Note:: Y values allowed depend on the likelihood used
+    ..Note:: Y values allowed depend on the likelihood_function used
     """
     def __init__(self,location=0,scale=1):
         self.location = location
         self.scale = scale

-class probit(likelihood):
+class probit(likelihood_function):
     """
     Probit likelihood
     Y is expected to take values in {-1,1}
@@ -29,7 +29,7 @@ class probit(likelihood):
         $$
     """
     def __init__(self,location=0,scale=1):
-        likelihood.__init__(self,Y,location,scale)
+        likelihood_function.__init__(self,location,scale)

     def moments_match(self,data_i,tau_i,v_i):
         """
@@ -64,7 +64,7 @@
     def _log_likelihood_gradients():
         return np.zeros(0) # there are no parameters of which to compute the gradients

-class poisson(likelihood):
+class poisson(likelihood_function):
     """
     Poisson likelihood
     Y is expected to take values in {0,1,2,...}
@@ -75,7 +75,8 @@
     """
     def __init__(self,Y,location=0,scale=1):
         assert len(Y[Y<0]) == 0, "Output cannot have negative values"
-        likelihood.__init__(self,Y,location,scale)
+        self.Y = Y
+        likelihood_function.__init__(self,location,scale)

     def moments_match(self,i,tau_i,v_i):
         """
@@ -160,7 +160,7 @@
         if Z is not None:
             pb.plot(Z,Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12)

-class gaussian(likelihood):
+class gaussian(likelihood_function):
     """
     Gaussian likelihood
     Y is expected to take values in (-inf,inf)
diff --git a/GPy/models/sparse_GP.py b/GPy/models/sparse_GP.py
index 7b043209..fe7bcc3b 100644
--- a/GPy/models/sparse_GP.py
+++ b/GPy/models/sparse_GP.py
@@ -6,37 +6,36 @@ import pylab as pb
 from ..util.linalg import mdot, jitchol, chol_inv, pdinv
 from ..util.plot import gpplot
 from .. import kern
+from ..inference.likelihoods import likelihood
 from GP import GP

-
 #Still TODO:
 # make use of slices properly (kernel can now do this)
 # enable heteroscedastic noise (kernel will need to compute psi2 as a (NxMxM) array)

 class sparse_GP(GP):
     """
-    Variational sparse GP model (Regression)
+    Variational sparse GP model

     :param X: inputs
     :type X: np.ndarray (N x Q)
-    :param Y: observed data
-    :type Y: np.ndarray of observations (N x D)
+    :param likelihood: a likelihood instance, containing the observed data
+    :type likelihood: GPy.likelihood.(Gaussian | EP)
     :param kernel : the kernel/covariance function. See link kernels
     :type kernel: a GPy kernel
-    :param Z: inducing inputs (optional, see note)
-    :type Z: np.ndarray (M x Q) | None
     :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance)
     :type X_uncertainty: np.ndarray (N x Q) | None
+    :param Z: inducing inputs (optional, see note)
+    :type Z: np.ndarray (M x Q) | None
     :param Zslices: slices for the inducing inputs (see slicing TODO: link)
     :param M : Number of inducing points (optional, default 10. Ignored if Z is not None)
     :type M: int
-    :param beta: noise precision. TODO> ignore beta if doing EP
-    :type beta: float
     :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales)
     :type normalize_(X|Y): bool
    """

-    def __init__(self,X,Y=None,kernel=None,X_uncertainty=None,beta=100.,Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False,likelihood=None,method_ep='DTC',epsilon_ep=1e-3,power_ep=[1.,1.]):
+    def __init__(self,X,likelihood,kernel, X_uncertainty=None, Z=None,Zslices=None,M=10,normalize_X=False):
+        self.scale_factor = 1000.0 # a scaling factor to help keep the algorithm stable

         if Z is None:
             self.Z = np.random.permutation(X.copy())[:M]
@@ -52,140 +51,91 @@ class sparse_GP(GP):
             self.has_uncertain_inputs=True
             self.X_uncertainty = X_uncertainty

-        GP.__init__(self, X=X, Y=Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y,likelihood=likelihood,epsilon_ep=epsilon_ep,power_ep=power_ep)
+        GP.__init__(self, X, kernel, likelihood, normalize_X=normalize_X)

         #normalise X uncertainty also
         if self.has_uncertain_inputs:
             self.X_uncertainty /= np.square(self._Xstd)

-        if not self.EP:
-            self.trYYT = np.sum(np.square(self.Y))
-        else:
-            self.method_ep = method_ep
-
-        #normalise X uncertainty also
-        if self.has_uncertain_inputs:
-            self.X_uncertainty /= np.square(self._Xstd)
-
-    def _set_params(self, p):
-        self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
-        if not self.EP:
-            self.beta = p[self.M*self.Q]
-            self.kern._set_params(p[self.Z.size + 1:])
-        else:
-            self.kern._set_params(p[self.Z.size:])
-        if self.Y is None:
-            self.Y = np.ones([self.N,1])
-        self._compute_kernel_matrices()
-        self._computations()
-
-    def _get_params(self):
-        if not self.EP:
-            return np.hstack([self.Z.flatten(),self.beta,self.kern._get_params_transformed()])
-        else:
-            return np.hstack([self.Z.flatten(),self.kern._get_params_transformed()])
-
-    def _get_param_names(self):
-        if not self.EP:
-            return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + ['noise_precision']+self.kern._get_param_names_transformed()
-        else:
-            return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + self.kern._get_param_names_transformed()
-
-
-    def _compute_kernel_matrices(self):
-        # kernel computations, using BGPLVM notation
-        #TODO: slices for psi statistics (easy enough)
-
-        self.Kmm = self.kern.K(self.Z)
-        if self.has_uncertain_inputs:
-            if not self.EP:
-                self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty)#.sum() NOTE psi0 is now a vector
-                self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T
-                self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty)
-                #self.psi2_beta_scaled = ?
-            else:
-                raise NotImplementedError, "uncertain_inputs not yet supported for EP"
-        else:
-            self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices)#.sum()
-            self.psi1 = self.kern.K(self.Z,self.X)
-            self.psi2 = np.dot(self.psi1,self.psi1.T)
-            self.psi2_beta_scaled = np.dot(self.psi1,self.beta*self.psi1.T)
-
     def _computations(self):
         # TODO find routine to multiply triangular matrices
-        self.V = self.beta*self.Y
+        #TODO: slices for psi statistics (easy enough)
+
+        sf = self.scale_factor
+        sf2 = sf**2
+
+        # kernel computations, using BGPLVM notation
+        self.Kmm = self.kern.K(self.Z)
+        if self.has_uncertain_inputs:
+            self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum()
+            self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T
+            self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty)
+            self.psi2_beta_scaled = (self.psi2*(self.beta/sf2)).sum(0)
+        else:
+            self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum()
+            self.psi1 = self.kern.K(self.Z,self.X)
+            tmp = self.psi1*(np.sqrt(self.likelihood.beta)/sf)
+            self.psi2_beta_scaled = np.dot(tmp,tmp.T)
+
+        self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)#+np.eye(self.M)*1e-3)
+
+        self.V = (self.likelihood.beta/self.scale_factor)*self.Y
+        self.A = mdot(self.Lmi, self.psi2_beta_scaled, self.Lmi.T)
+        self.B = np.eye(self.M)/sf2 + self.A
+
+        self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B)
+
+        self.psi1V = np.dot(self.psi1, self.V)
         self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T)
-        self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)
-        self.A = mdot(self.Lmi, self.psi2_beta_scaled, self.Lmi.T)
-        self.B = np.eye(self.M) + self.A
-        self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B)
-        self.LLambdai = np.dot(self.LBi, self.Lmi)
-        self.LBL_inv = mdot(self.Lmi.T, self.Bi, self.Lmi)
-        self.C = mdot(self.LLambdai, self.psi1V)
-        self.G = mdot(self.LBL_inv, self.psi1VVpsi1, self.LBL_inv.T)
-        self.trace_K_beta_scaled = (self.psi0*self.beta).sum() - np.trace(self.A)
-        if not self.EP:
-            self.trace_K = self.psi0.sum() - np.trace(self.A)/self.beta
+        self.C = mdot(self.Lmi.T, self.Bi, self.Lmi)
+        self.E = mdot(self.C, self.psi1VVpsi1/sf2, self.C.T)

-        # Compute dL_dpsi
-        self.dL_dpsi1 = mdot(self.LLambdai.T,self.C,self.V.T)
-        if not self.EP:
-            self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N)
-            if self.has_uncertain_inputs:
-                self.dL_dpsi2 = - 0.5 * self.beta * (self.D*(self.LBL_inv - self.Kmmi) + self.G)
-            else:
-                self.dL_dpsi2_ = - 0.5 * (self.D*(self.LBL_inv - self.Kmmi) + self.G)
-        else:
-            self.dL_dpsi0 = - 0.5 * self.D * self.beta.flatten()
-            if not self.has_uncertain_inputs:
-                self.dL_dpsi2_ = - 0.5 * (self.D*(self.LBL_inv - self.Kmmi) + self.G)
+        # Compute dL_dpsi # FIXME
+        self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N)
+        self.dL_dpsi1 = mdot(self.V, self.psi1V.T,self.C).T
+        self.dL_dpsi2 = 0.5 * self.beta * self.D * self.Kmmi[None,:,:] # dB
+        self.dL_dpsi2 += - 0.5 * self.beta/sf2 * self.D * self.C[None,:,:] # dC
+        self.dL_dpsi2 += - 0.5 * self.beta * self.E[None,:,:] # dD

         # Compute dL_dKmm
-        self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi) # dB
-        self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - 2.*mdot(self.LBL_inv, self.psi2_beta_scaled, self.Kmmi) + self.Kmmi) # dC
-        self.dL_dKmm += np.dot(np.dot(self.G,self.psi2_beta_scaled) - np.dot(self.LBL_inv, self.psi1VVpsi1), self.Kmmi) + 0.5*self.G # dE
+        self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi)*sf2 # dB
+        self.dL_dKmm += -0.5 * self.D * (- self.C/sf2 - 2.*mdot(self.C, self.psi2_beta_scaled, self.Kmmi) + self.Kmmi) # dC
+        self.dL_dKmm += np.dot(np.dot(self.E*sf2, self.psi2_beta_scaled) - np.dot(self.C, self.psi1VVpsi1), self.Kmmi) + 0.5*self.E # dD

-    def approximate_likelihood(self):
-        assert not isinstance(self.likelihood, gaussian), "EP is only available for non-gaussian likelihoods"
-        if self.method_ep == 'DTC':
-            self.ep_approx = DTC(self.Kmm,self.likelihood,self.psi1,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta])
-        elif self.method_ep == 'FITC':
-            self.ep_approx = FITC(self.Kmm,self.likelihood,self.psi1,self.psi0,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta])
-        else:
-            self.ep_approx = Full(self.X,self.likelihood,self.kernel,inducing=None,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta])
-        self.beta, self.Y, self.Z_ep = self.ep_approx.fit_EP()
-        self.trbetaYYT = np.sum(np.square(self.Y)*self.beta)
+
+    def _set_params(self, p):
+        self.Z = p[:self.M*self.Q].reshape(self.M, self.Q)
+        self.beta = p[self.M*self.Q] # FIXME
+        self.kern._set_params(p[self.Z.size + 1:])
         self._computations()

+    def _get_params(self):
+        return np.hstack([self.Z.flatten(), GP._get_params(self)])
+
+    def _get_param_names(self):
+        return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + GP._get_param_names(self)
+
     def log_likelihood(self):
-        """
-        Compute the (lower bound on the) log marginal likelihood
-        """
-        if not self.EP:
-            A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta))
-            D = -0.5*self.beta*self.trYYT
-        else:
-            A = -0.5*self.D*(self.N*np.log(2.*np.pi) - np.sum(np.log(self.beta)))
-            D = -0.5*self.trbetaYYT
-        B = -0.5*self.D*self.trace_K_beta_scaled
-        C = -0.5*self.D * self.B_logdet
-        E = +0.5*np.sum(self.psi1VVpsi1 * self.LBL_inv)
-        return A+B+C+D+E
+        """ Compute the (lower bound on the) log marginal likelihood """
+        sf2 = self.scale_factor**2
+        A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta)) -0.5*self.beta*self.trYYT # FIXME
+        B = -0.5*self.D*(self.beta*self.psi0-np.trace(self.A)*sf2) # FIXME
+        C = -0.5*self.D * (self.B_logdet + self.M*np.log(sf2))
+        D = +0.5*np.sum(self.psi1VVpsi1 * self.C)
+        return A+B+C+D

+    def _log_likelihood_gradients(self):
+        return np.hstack([self.dL_dZ().flatten(), GP._log_likelihood_gradients(self)])
+
+    # FIXME: move this into the likelihood class
     def dL_dbeta(self):
-        """
-        Compute the gradient of the log likelihood wrt beta.
- """ - #TODO: suport heteroscedatic noise - dA_dbeta = 0.5 * self.N*self.D/self.beta - dB_dbeta = - 0.5 * self.D * self.trace_K + sf2 = self.scale_factor**2 + dA_dbeta = 0.5 * self.N*self.D/self.beta - 0.5 * self.trYYT + dB_dbeta = - 0.5 * self.D * (self.psi0 - np.trace(self.A)/self.beta*sf2) dC_dbeta = - 0.5 * self.D * np.sum(self.Bi*self.A)/self.beta - dD_dbeta = - 0.5 * self.trYYT - tmp = mdot(self.LBi.T, self.LLambdai, self.psi1V) - dE_dbeta = (np.sum(np.square(self.C)) - 0.5 * np.sum(self.A * np.dot(tmp, tmp.T)))/self.beta + dD_dbeta = np.sum((self.C - 0.5 * mdot(self.C,self.psi2_beta_scaled,self.C) ) * self.psi1VVpsi1 )/self.beta - return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta + dE_dbeta) + return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta) def dL_dtheta(self): """ @@ -195,10 +145,10 @@ class sparse_GP(GP): if self.has_uncertain_inputs: dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty) dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty) - dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # for multiple_beta, dL_dpsi2 will be a different shape + dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.dL_dpsi1.T, self.Z,self.X, self.X_uncertainty) else: #re-cast computations in psi2 back to psi1: - dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2_,self.beta.T*self.psi1) #dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1) + dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2.sum(0),self.psi1) dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X) dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X) @@ -208,48 +158,36 @@ class sparse_GP(GP): """ The derivative of the bound wrt the inducing inputs Z """ - dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z,)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ + dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ if self.has_uncertain_inputs: - dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty) - dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) + dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1,self.Z,self.X, self.X_uncertainty) + dL_dZ += 2.*self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # 'stripes' else: #re-cast computations in psi2 back to psi1: - dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2_,self.beta.T*self.psi1)#dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1) + dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2.sum(0),self.psi1) dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X) return dL_dZ - def _log_likelihood_gradients(self): - if not self.EP: - return np.hstack([self.dL_dZ().flatten(), self.dL_dbeta(), self.dL_dtheta()]) - else: - return np.hstack([self.dL_dZ().flatten(), self.dL_dtheta()]) - def _raw_predict(self, Xnew, slices, full_cov=False): """Internal helper function for making predictions, does not account for normalisation""" + Kx = self.kern.K(self.Z, Xnew) - mu = mdot(Kx.T, self.LBL_inv, self.psi1V) - phi = None + mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V) + if full_cov: Kxx = self.kern.K(Xnew) - var = Kxx - mdot(Kx.T, (self.Kmmi - self.LBL_inv), Kx) - if not self.EP: - var += np.eye(Xnew.shape[0])/self.beta - else: - raise NotImplementedError, "full_cov = True not implemented for EP" + var = Kxx - mdot(Kx.T, (self.Kmmi - self.C/self.scale_factor**2), Kx) else: Kxx = self.kern.Kdiag(Xnew) - var = 
Kxx - np.sum(Kx*np.dot(self.Kmmi - self.LBL_inv, Kx),0) - if not self.EP: - var += 1./self.beta - else: - phi = self.likelihood.predictive_mean(mu,var) - return mu,var,phi + var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0) + + return mu,var def plot(self, *args, **kwargs): """ Plot the fitted model: just call the GP_regression plot function and then add inducing inputs """ - GP.plot(self,*args,**kwargs) + GP_regression.plot(self,*args,**kwargs) if self.Q==1: pb.plot(self.Z,self.Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12) if self.has_uncertain_inputs: diff --git a/GPy/models/sparse_GP_old.py b/GPy/models/sparse_GP_old.py new file mode 100644 index 00000000..7b043209 --- /dev/null +++ b/GPy/models/sparse_GP_old.py @@ -0,0 +1,258 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import numpy as np +import pylab as pb +from ..util.linalg import mdot, jitchol, chol_inv, pdinv +from ..util.plot import gpplot +from .. import kern +from GP import GP + + +#Still TODO: +# make use of slices properly (kernel can now do this) +# enable heteroscedatic noise (kernel will need to compute psi2 as a (NxMxM) array) + +class sparse_GP(GP): + """ + Variational sparse GP model (Regression) + + :param X: inputs + :type X: np.ndarray (N x Q) + :param Y: observed data + :type Y: np.ndarray of observations (N x D) + :param kernel : the kernel/covariance function. See link kernels + :type kernel: a GPy kernel + :param Z: inducing inputs (optional, see note) + :type Z: np.ndarray (M x Q) | None + :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance) + :type X_uncertainty: np.ndarray (N x Q) | None + :param Zslices: slices for the inducing inputs (see slicing TODO: link) + :param M : Number of inducing points (optional, default 10. Ignored if Z is not None) + :type M: int + :param beta: noise precision. 
TODO> ignore beta if doing EP + :type beta: float + :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales) + :type normalize_(X|Y): bool + """ + + def __init__(self,X,Y=None,kernel=None,X_uncertainty=None,beta=100.,Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False,likelihood=None,method_ep='DTC',epsilon_ep=1e-3,power_ep=[1.,1.]): + + if Z is None: + self.Z = np.random.permutation(X.copy())[:M] + self.M = M + else: + assert Z.shape[1]==X.shape[1] + self.Z = Z + self.M = Z.shape[0] + if X_uncertainty is None: + self.has_uncertain_inputs=False + else: + assert X_uncertainty.shape==X.shape + self.has_uncertain_inputs=True + self.X_uncertainty = X_uncertainty + + GP.__init__(self, X=X, Y=Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y,likelihood=likelihood,epsilon_ep=epsilon_ep,power_ep=power_ep) + + #normalise X uncertainty also + if self.has_uncertain_inputs: + self.X_uncertainty /= np.square(self._Xstd) + + if not self.EP: + self.trYYT = np.sum(np.square(self.Y)) + else: + self.method_ep = method_ep + + #normalise X uncertainty also + if self.has_uncertain_inputs: + self.X_uncertainty /= np.square(self._Xstd) + + def _set_params(self, p): + self.Z = p[:self.M*self.Q].reshape(self.M, self.Q) + if not self.EP: + self.beta = p[self.M*self.Q] + self.kern._set_params(p[self.Z.size + 1:]) + else: + self.kern._set_params(p[self.Z.size:]) + if self.Y is None: + self.Y = np.ones([self.N,1]) + self._compute_kernel_matrices() + self._computations() + + def _get_params(self): + if not self.EP: + return np.hstack([self.Z.flatten(),self.beta,self.kern._get_params_transformed()]) + else: + return np.hstack([self.Z.flatten(),self.kern._get_params_transformed()]) + + def _get_param_names(self): + if not self.EP: + return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + ['noise_precision']+self.kern._get_param_names_transformed() + else: + return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + self.kern._get_param_names_transformed() + + + def _compute_kernel_matrices(self): + # kernel computations, using BGPLVM notation + #TODO: slices for psi statistics (easy enough) + + self.Kmm = self.kern.K(self.Z) + if self.has_uncertain_inputs: + if not self.EP: + self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty)#.sum() NOTE psi0 is now a vector + self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T + self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty) + #self.psi2_beta_scaled = ? 
+ else: + raise NotImplementedError, "uncertain_inputs not yet supported for EP" + else: + self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices)#.sum() + self.psi1 = self.kern.K(self.Z,self.X) + self.psi2 = np.dot(self.psi1,self.psi1.T) + self.psi2_beta_scaled = np.dot(self.psi1,self.beta*self.psi1.T) + + def _computations(self): + # TODO find routine to multiply triangular matrices + self.V = self.beta*self.Y + self.psi1V = np.dot(self.psi1, self.V) + self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T) + self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm) + self.A = mdot(self.Lmi, self.psi2_beta_scaled, self.Lmi.T) + self.B = np.eye(self.M) + self.A + self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B) + self.LLambdai = np.dot(self.LBi, self.Lmi) + self.LBL_inv = mdot(self.Lmi.T, self.Bi, self.Lmi) + self.C = mdot(self.LLambdai, self.psi1V) + self.G = mdot(self.LBL_inv, self.psi1VVpsi1, self.LBL_inv.T) + self.trace_K_beta_scaled = (self.psi0*self.beta).sum() - np.trace(self.A) + if not self.EP: + self.trace_K = self.psi0.sum() - np.trace(self.A)/self.beta + + # Compute dL_dpsi + self.dL_dpsi1 = mdot(self.LLambdai.T,self.C,self.V.T) + if not self.EP: + self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N) + if self.has_uncertain_inputs: + self.dL_dpsi2 = - 0.5 * self.beta * (self.D*(self.LBL_inv - self.Kmmi) + self.G) + else: + self.dL_dpsi2_ = - 0.5 * (self.D*(self.LBL_inv - self.Kmmi) + self.G) + else: + self.dL_dpsi0 = - 0.5 * self.D * self.beta.flatten() + if not self.has_uncertain_inputs: + self.dL_dpsi2_ = - 0.5 * (self.D*(self.LBL_inv - self.Kmmi) + self.G) + + # Compute dL_dKmm + self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi) # dB + self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - 2.*mdot(self.LBL_inv, self.psi2_beta_scaled, self.Kmmi) + self.Kmmi) # dC + self.dL_dKmm += np.dot(np.dot(self.G,self.psi2_beta_scaled) - np.dot(self.LBL_inv, self.psi1VVpsi1), self.Kmmi) + 0.5*self.G # dE + + def approximate_likelihood(self): + assert not isinstance(self.likelihood, gaussian), "EP is only available for non-gaussian likelihoods" + if self.method_ep == 'DTC': + self.ep_approx = DTC(self.Kmm,self.likelihood,self.psi1,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta]) + elif self.method_ep == 'FITC': + self.ep_approx = FITC(self.Kmm,self.likelihood,self.psi1,self.psi0,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta]) + else: + self.ep_approx = Full(self.X,self.likelihood,self.kernel,inducing=None,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta]) + self.beta, self.Y, self.Z_ep = self.ep_approx.fit_EP() + self.trbetaYYT = np.sum(np.square(self.Y)*self.beta) + self._computations() + + def log_likelihood(self): + """ + Compute the (lower bound on the) log marginal likelihood + """ + if not self.EP: + A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta)) + D = -0.5*self.beta*self.trYYT + else: + A = -0.5*self.D*(self.N*np.log(2.*np.pi) - np.sum(np.log(self.beta))) + D = -0.5*self.trbetaYYT + B = -0.5*self.D*self.trace_K_beta_scaled + C = -0.5*self.D * self.B_logdet + E = +0.5*np.sum(self.psi1VVpsi1 * self.LBL_inv) + return A+B+C+D+E + + def dL_dbeta(self): + """ + Compute the gradient of the log likelihood wrt beta. 
+ """ + #TODO: suport heteroscedatic noise + dA_dbeta = 0.5 * self.N*self.D/self.beta + dB_dbeta = - 0.5 * self.D * self.trace_K + dC_dbeta = - 0.5 * self.D * np.sum(self.Bi*self.A)/self.beta + dD_dbeta = - 0.5 * self.trYYT + tmp = mdot(self.LBi.T, self.LLambdai, self.psi1V) + dE_dbeta = (np.sum(np.square(self.C)) - 0.5 * np.sum(self.A * np.dot(tmp, tmp.T)))/self.beta + + return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta + dE_dbeta) + + def dL_dtheta(self): + """ + Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel + """ + dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z) + if self.has_uncertain_inputs: + dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty) + dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty) + dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # for multiple_beta, dL_dpsi2 will be a different shape + else: + #re-cast computations in psi2 back to psi1: + dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2_,self.beta.T*self.psi1) #dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1) + dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X) + dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X) + + return dL_dtheta + + def dL_dZ(self): + """ + The derivative of the bound wrt the inducing inputs Z + """ + dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z,)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ + if self.has_uncertain_inputs: + dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty) + dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) + else: + #re-cast computations in psi2 back to psi1: + dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2_,self.beta.T*self.psi1)#dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1) + dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X) + return dL_dZ + + def _log_likelihood_gradients(self): + if not self.EP: + return np.hstack([self.dL_dZ().flatten(), self.dL_dbeta(), self.dL_dtheta()]) + else: + return np.hstack([self.dL_dZ().flatten(), self.dL_dtheta()]) + + def _raw_predict(self, Xnew, slices, full_cov=False): + """Internal helper function for making predictions, does not account for normalisation""" + Kx = self.kern.K(self.Z, Xnew) + mu = mdot(Kx.T, self.LBL_inv, self.psi1V) + phi = None + if full_cov: + Kxx = self.kern.K(Xnew) + var = Kxx - mdot(Kx.T, (self.Kmmi - self.LBL_inv), Kx) + if not self.EP: + var += np.eye(Xnew.shape[0])/self.beta + else: + raise NotImplementedError, "full_cov = True not implemented for EP" + else: + Kxx = self.kern.Kdiag(Xnew) + var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.LBL_inv, Kx),0) + if not self.EP: + var += 1./self.beta + else: + phi = self.likelihood.predictive_mean(mu,var) + return mu,var,phi + + def plot(self, *args, **kwargs): + """ + Plot the fitted model: just call the GP_regression plot function and then add inducing inputs + """ + GP.plot(self,*args,**kwargs) + if self.Q==1: + pb.plot(self.Z,self.Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12) + if self.has_uncertain_inputs: + pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten())) + if self.Q==2: + pb.plot(self.Z[:,0],self.Z[:,1],'wo') diff --git a/GPy/models/sparse_GP_regression.py b/GPy/models/sparse_GP_regression.py deleted file mode 100644 index 07ce4d97..00000000 --- a/GPy/models/sparse_GP_regression.py +++ /dev/null @@ -1,205 +0,0 @@ -# 
Copyright (c) 2012, GPy authors (see AUTHORS.txt). -# Licensed under the BSD 3-clause license (see LICENSE.txt) - -import numpy as np -import pylab as pb -from ..util.linalg import mdot, jitchol, chol_inv, pdinv -from ..util.plot import gpplot -from .. import kern -from ..inference.likelihoods import likelihood -from GP_regression import GP_regression - -#Still TODO: -# make use of slices properly (kernel can now do this) -# enable heteroscedatic noise (kernel will need to compute psi2 as a (NxMxM) array) - -class sparse_GP_regression(GP_regression): - """ - Variational sparse GP model (Regression) - - :param X: inputs - :type X: np.ndarray (N x Q) - :param Y: observed data - :type Y: np.ndarray of observations (N x D) - :param kernel : the kernel/covariance function. See link kernels - :type kernel: a GPy kernel - :param Z: inducing inputs (optional, see note) - :type Z: np.ndarray (M x Q) | None - :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance) - :type X_uncertainty: np.ndarray (N x Q) | None - :param Zslices: slices for the inducing inputs (see slicing TODO: link) - :param M : Number of inducing points (optional, default 10. Ignored if Z is not None) - :type M: int - :param beta: noise precision. TODO> ignore beta if doing EP - :type beta: float - :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales) - :type normalize_(X|Y): bool - """ - - def __init__(self,X,Y,kernel=None, X_uncertainty=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False): - self.scale_factor = 1000.0 - self.beta = beta - if Z is None: - self.Z = np.random.permutation(X.copy())[:M] - self.M = M - else: - assert Z.shape[1]==X.shape[1] - self.Z = Z - self.M = Z.shape[0] - if X_uncertainty is None: - self.has_uncertain_inputs=False - else: - assert X_uncertainty.shape==X.shape - self.has_uncertain_inputs=True - self.X_uncertainty = X_uncertainty - - GP_regression.__init__(self, X, Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y) - self.trYYT = np.sum(np.square(self.Y)) - - #normalise X uncertainty also - if self.has_uncertain_inputs: - self.X_uncertainty /= np.square(self._Xstd) - - def _computations(self): - # TODO find routine to multiply triangular matrices - #TODO: slices for psi statistics (easy enough) - - # kernel computations, using BGPLVM notation - self.Kmm = self.kern.K(self.Z) - if self.has_uncertain_inputs: - self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum() - self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T - self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty) - self.psi2_beta_scaled = (self.psi2*(self.beta/self.scale_factor**2)).sum(0) - else: - self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum() - self.psi1 = self.kern.K(self.Z,self.X) - #self.psi2 = np.dot(self.psi1,self.psi1.T) - #self.psi2 = self.psi1.T[:,:,None]*self.psi1.T[:,None,:] - tmp = self.psi1/(self.scale_factor/np.sqrt(self.beta)) - self.psi2_beta_scaled = np.dot(tmp,tmp.T) - - sf = self.scale_factor - sf2 = sf**2 - - self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)#+np.eye(self.M)*1e-3) - - self.V = (self.beta/self.scale_factor)*self.Y - self.A = mdot(self.Lmi, self.psi2_beta_scaled, self.Lmi.T) - self.B = np.eye(self.M)/sf2 + self.A - - self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B) - - self.psi1V = np.dot(self.psi1, self.V) - self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T) - self.C = mdot(self.Lmi.T, self.Bi, self.Lmi) - 
self.E = mdot(self.C, self.psi1VVpsi1/sf2, self.C.T) - - # Compute dL_dpsi - self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N) - self.dL_dpsi1 = mdot(self.V, self.psi1V.T,self.C).T - self.dL_dpsi2 = 0.5 * self.beta * self.D * self.Kmmi[None,:,:] # dB - self.dL_dpsi2 += - 0.5 * self.beta/sf2 * self.D * self.C[None,:,:] # dC - self.dL_dpsi2 += - 0.5 * self.beta * self.E[None,:,:] # dD - - # Compute dL_dKmm - self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi)*sf2 # dB - self.dL_dKmm += -0.5 * self.D * (- self.C/sf2 - 2.*mdot(self.C, self.psi2_beta_scaled, self.Kmmi) + self.Kmmi) # dC - self.dL_dKmm += np.dot(np.dot(self.E*sf2, self.psi2_beta_scaled) - np.dot(self.C, self.psi1VVpsi1), self.Kmmi) + 0.5*self.E # dD - - - def _set_params(self, p): - self.Z = p[:self.M*self.Q].reshape(self.M, self.Q) - self.beta = p[self.M*self.Q] - self.kern._set_params(p[self.Z.size + 1:]) - self._computations() - - def _get_params(self): - return np.hstack([self.Z.flatten(),self.beta,self.kern._get_params_transformed()]) - - def _get_param_names(self): - return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + ['noise_precision']+self.kern._get_param_names_transformed() - - - def log_likelihood(self): - """ Compute the (lower bound on the) log marginal likelihood """ - sf2 = self.scale_factor**2 - A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta)) -0.5*self.beta*self.trYYT - B = -0.5*self.D*(self.beta*self.psi0-np.trace(self.A)*sf2) - C = -0.5*self.D * (self.B_logdet + self.M*np.log(sf2)) - D = +0.5*np.sum(self.psi1VVpsi1 * self.C) - return A+B+C+D - - def _log_likelihood_gradients(self): - return np.hstack([self.dL_dZ().flatten(), self.dL_dbeta(), self.dL_dtheta()]) - - def dL_dbeta(self): - """ - Compute the gradient of the log likelihood wrt beta. 
- """ - #TODO: suport heteroscedatic noise - sf2 = self.scale_factor**2 - dA_dbeta = 0.5 * self.N*self.D/self.beta - 0.5 * self.trYYT - dB_dbeta = - 0.5 * self.D * (self.psi0 - np.trace(self.A)/self.beta*sf2) - dC_dbeta = - 0.5 * self.D * np.sum(self.Bi*self.A)/self.beta - dD_dbeta = np.sum((self.C - 0.5 * mdot(self.C,self.psi2_beta_scaled,self.C) ) * self.psi1VVpsi1 )/self.beta - - return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta) - - def dL_dtheta(self): - """ - Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel - """ - dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z) - if self.has_uncertain_inputs: - dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty) - dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty) - dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.dL_dpsi1.T, self.Z,self.X, self.X_uncertainty) # for multiple_beta, dL_dpsi2 will be a different shape - else: - #re-cast computations in psi2 back to psi1: - dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2.sum(0),self.psi1) - dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X) - dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X) - - return dL_dtheta - - def dL_dZ(self): - """ - The derivative of the bound wrt the inducing inputs Z - """ - dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ - if self.has_uncertain_inputs: - dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1,self.Z,self.X, self.X_uncertainty) - dL_dZ += 2.*self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # 'stripes' - else: - #re-cast computations in psi2 back to psi1: - dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2.sum(0),self.psi1) - dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X) - return dL_dZ - - def _raw_predict(self, Xnew, slices, full_cov=False): - """Internal helper function for making predictions, does not account for normalisation""" - - Kx = self.kern.K(self.Z, Xnew) - mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V) - - if full_cov: - Kxx = self.kern.K(Xnew) - var = Kxx - mdot(Kx.T, (self.Kmmi - self.C/self.scale_factor**2), Kx) + np.eye(Xnew.shape[0])/self.beta # TODO: This beta doesn't belong here in the EP case. - else: - Kxx = self.kern.Kdiag(Xnew) - var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0) + 1./self.beta # TODO: This beta doesn't belong here in the EP case. 
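The TODO above marks the theme of this whole series: the additive 1./beta term is Gaussian-specific observation noise, so it belongs in the likelihood object rather than in the model's predictive code. A scalar sketch of the split, with assumed toy values:

    beta = 100.                    # Gaussian noise precision (toy value)
    latent_var = 0.03              # the Kxx - Qff correction from the sparse posterior (toy value)
    y_var = latent_var + 1./beta   # observation variance; valid only for Gaussian noise --
                                   # under EP the likelihood's predictive_values supplies this step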
- - return mu,var - - def plot(self, *args, **kwargs): - """ - Plot the fitted model: just call the GP_regression plot function and then add inducing inputs - """ - GP_regression.plot(self,*args,**kwargs) - if self.Q==1: - pb.plot(self.Z,self.Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12) - if self.has_uncertain_inputs: - pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten())) - if self.Q==2: - pb.plot(self.Z[:,0],self.Z[:,1],'wo') From 346f9dd8bd3207959b87ded258e55aeb094f1ea3 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Fri, 1 Feb 2013 10:05:22 +0000 Subject: [PATCH 30/44] added a likelihood atom class and also some import tidying in the EP.py file --- GPy/likelihoods/EP.py | 12 ++++-------- GPy/likelihoods/Gaussian.py | 3 ++- GPy/likelihoods/likelihood.py | 35 +++++++++++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 9 deletions(-) create mode 100644 GPy/likelihoods/likelihood.py diff --git a/GPy/likelihoods/EP.py b/GPy/likelihoods/EP.py index c52fd8bf..ff612a6d 100644 --- a/GPy/likelihoods/EP.py +++ b/GPy/likelihoods/EP.py @@ -1,11 +1,9 @@ import numpy as np -import random from scipy import stats, linalg -from ..core import model from ..util.linalg import pdinv,mdot,jitchol -from ..util.plot import gpplot +from likelihood import likelihood -class EP: +class EP(likelihood): def __init__(self,data,likelihood_function,epsilon=1e-3,power_ep=[1.,1.]): """ Expectation Propagation @@ -70,8 +68,7 @@ class EP: self.np1 = [self.tau_tilde.copy()] self.np2 = [self.v_tilde.copy()] while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: - update_order = np.arange(self.N) - random.shuffle(update_order) + update_order = np.random.permutation(self.N) for i in update_order: #Cavity distribution parameters self.tau_[i] = 1./self.Sigma[i,i] - self.eta*self.tau_tilde[i] @@ -243,8 +240,7 @@ class EP: self.np1 = [self.tau_tilde.copy()] self.np2 = [self.v_tilde.copy()] while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: - update_order = np.arange(self.N) - random.shuffle(update_order) + update_order = np.random.permutation(self.N) for i in update_order: #Cavity distribution parameters self.tau_[i] = 1./self.Sigma_diag[i] - self.eta*self.tau_tilde[i] diff --git a/GPy/likelihoods/Gaussian.py b/GPy/likelihoods/Gaussian.py index 5b461537..cb040d50 100644 --- a/GPy/likelihoods/Gaussian.py +++ b/GPy/likelihoods/Gaussian.py @@ -1,6 +1,7 @@ import numpy as np +from likelihood import likelihood -class Gaussian: +class Gaussian(likelihood): def __init__(self,data,variance=1.,normalize=False): self.is_heteroscedastic = False self.data = data diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py new file mode 100644 index 00000000..6ec57c07 --- /dev/null +++ b/GPy/likelihoods/likelihood.py @@ -0,0 +1,35 @@ +import numpy as np + +class likelihood: + """ + The atom for a likelihood class + + This object interfaces the GP and the data. 
The most basic likelihood + (Gaussian) inherits directly from this, as does the EP algorithm + + Some things must be defined for this to work properly: + self.Y : the effective Gaussian target of the GP + self.N, self.D : Y.shape + self.covariance_matrix : the effective (noise) covariance of the GP targets + self.Z : a factor which gets added to the likelihood (0 for a Gaussian, Z_EP for EP) + self.is_heteroscedastic : enables significant computational savings in GP + self.precision : a scalar or vector representation of the effective target precision + self.YYT : (optional) = np.dot(self.Y, self.Y.T) enables computational savings for D>N + """ + def __init__(self,data): + raise ValueError, "this class is not to be instantiated" + + def _get_params(self): + raise NotImplementedError + + def _get_param_names(self): + raise NotImplementedError + + def _set_params(self,x): + raise NotImplementedError + + def fit(self): + raise NotImplementedError + + def _gradients(self,partial): + raise NotImplementedError From 182c4c7d64d9e85191f0aac45e67c77857b63cf7 Mon Sep 17 00:00:00 2001 From: Ricardo Andrade Date: Fri, 1 Feb 2013 13:17:17 +0000 Subject: [PATCH 31/44] So many changes --- GPy/core/model.py | 9 +-- GPy/examples/ep_fix.py | 50 +++++++------- GPy/likelihoods/EP.py | 31 ++++++--- GPy/likelihoods/Gaussian.py | 2 +- GPy/likelihoods/likelihood_functions.py | 74 ++++---------------- GPy/models/GP.py | 89 ++++++++++++++----------- GPy/util/plot.py | 2 + 7 files changed, 118 insertions(+), 139 deletions(-) diff --git a/GPy/core/model.py b/GPy/core/model.py index c7b61a32..f26bf2ee 100644 --- a/GPy/core/model.py +++ b/GPy/core/model.py @@ -10,6 +10,7 @@ from parameterised import parameterised, truncate_pad import priors from ..util.linalg import jitchol from ..inference import optimization +from .. import likelihoods class model(parameterised): def __init__(self): @@ -401,7 +402,7 @@ class model(parameterised): :type optimzer: string TODO: valid strings? """ - assert self.EP, "EM is not available for gaussian likelihood" + assert isinstance(self.likelihood,likelihoods.EP), "EM is not available for Gaussian likelihoods" log_change = epsilon + 1. self.log_likelihood_record = [] self.gp_params_record = [] @@ -410,18 +411,18 @@ class model(parameterised): last_value = -np.exp(1000) while log_change > epsilon or not iteration: print 'EM iteration %s' %iteration - self.approximate_likelihood() + self.update_likelihood_approximation() self.optimize(**kwargs) new_value = self.log_likelihood() log_change = new_value - last_value if log_change > epsilon: self.log_likelihood_record.append(new_value) self.gp_params_record.append(self._get_params()) - self.ep_params_record.append((self.beta,self.Y,self.Z_ep)) + #self.ep_params_record.append((self.beta,self.Y,self.Z_ep)) last_value = new_value else: convergence = False - self.beta, self.Y, self.Z_ep = self.ep_params_record[-1] + #self.beta, self.Y, self.Z_ep = self.ep_params_record[-1] self._set_params(self.gp_params_record[-1]) print "Log-likelihood decrement: %s \nLast iteration discarded." %log_change iteration += 1 diff --git a/GPy/examples/ep_fix.py b/GPy/examples/ep_fix.py index 8041cc91..83a58bf8 100644 --- a/GPy/examples/ep_fix.py +++ b/GPy/examples/ep_fix.py @@ -3,7 +3,8 @@ """ -Simple Gaussian Processes classification +Simple Gaussian Processes classification 1D +Probit likelihood """ import pylab as pb import numpy as np @@ -12,28 +13,31 @@ pb.ion() pb.close('all') -model_type='Full' -inducing=4 -"""Simple 1D classification example. 
-:param model_type: type of model to fit ['Full', 'FITC', 'DTC']. -:param seed : seed value for data generation (default is 4). -:type seed: int -:param inducing : number of inducing variables (only used for 'FITC' or 'DTC'). -:type inducing: int -""" -data = GPy.util.datasets.toy_linear_1d_classification(seed=0) -likelihood = GPy.inference.likelihoods.probit(data['Y'][:, 0:1]) +# Inputs +N = 30 +X1 = np.random.normal(5,2,N/2) +X2 = np.random.normal(10,2,N/2) +X = np.hstack([X1,X2])[:,None] -m = GPy.models.GP(data['X'],likelihood=likelihood) -#m = GPy.models.GP(data['X'],likelihood.Y) +# Outputs +Y = np.hstack([np.ones(N/2),np.repeat(-1,N/2)])[:,None] + +# Kernel object +kernel = GPy.kern.rbf(1) + +# Define likelihood +distribution = GPy.likelihoods.likelihood_functions.Probit() +likelihood_object = GPy.likelihoods.EP(Y,distribution) + +# Model definition +m = GPy.models.GP(X,kernel,likelihood=likelihood_object) m.ensure_default_constraints() +m.update_likelihood_approximation() +#m.checkgrad(verbose=1) +m.optimize() +print "Round 2" +m.update_likelihood_approximation() -# Optimize and plot -#if not isinstance(m.likelihood,GPy.inference.likelihoods.gaussian): -# m.approximate_likelihood() -#m.optimize() -m.EM() - -print m.log_likelihood() -m.plot(samples=3) -print(m) +#m.EPEM() +#m.plot() +#print(m) diff --git a/GPy/likelihoods/EP.py b/GPy/likelihoods/EP.py index b557a62f..bec81436 100644 --- a/GPy/likelihoods/EP.py +++ b/GPy/likelihoods/EP.py @@ -1,7 +1,7 @@ import numpy as np import random from scipy import stats, linalg -from ..core import model +#from ..core import model from ..util.linalg import pdinv,mdot,jitchol from ..util.plot import gpplot @@ -18,6 +18,8 @@ class EP: self.likelihood_function = likelihood_function self.epsilon = epsilon self.eta, self.delta = power_ep + self.data = data + self.N = self.data.size """ Initial values - Likelihood approximation parameters: @@ -26,6 +28,12 @@ class EP: self.tau_tilde = np.zeros(self.N) self.v_tilde = np.zeros(self.N) + #initial values for the GP variables + self.Y = np.zeros((self.N,1)) + self.variance = np.zeros((self.N,self.N))#np.eye(self.N) + self.Z = 0 + self.YYT = None + def predictive_values(self,mu,var): return self.likelihood_function.predictive_values(mu,var) @@ -35,6 +43,8 @@ class EP: return [] def _set_params(self,p): pass # TODO: the EP likelihood might want to take some parameters... + def _gradients(self,partial): + return np.zeros(0) # TODO: the EP likelihood might want to take some parameters... 
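After fitting, the site parameters (tau_tilde, v_tilde) initialised above are converted by _compute_GP_variables (below) into a Gaussian pseudo-dataset for the GP: pseudo-targets v_tilde/tau_tilde with heteroscedastic pseudo-noise 1/tau_tilde. A toy sketch of that conversion, with assumed site values:

    import numpy as np
    tau_tilde = np.array([1.2, 0.8, 2.0])    # site precisions (toy values)
    v_tilde   = np.array([0.6, -0.4, 1.0])   # site precision-scaled means (toy values)
    Y_pseudo = (v_tilde/tau_tilde)[:,None]   # stands in for Y in the GP model
    noise_cov = np.diag(1./tau_tilde)        # effective heteroscedastic noise covariance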
     def _compute_GP_variables(self):
         #Variables to be called from GP
         mu_tilde = self.v_tilde/self.tau_tilde #When calling EP, this variable is used instead of Y in the GP model
         sigma_sum = 1./self.tau_ + 1./self.tau_tilde
         mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2
         Z_ep = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant
-        self.Y, self.beta, self.Z = self.tau_tilde[:,None], mu_tilde[:,None], Z_ep
+        self.Y, self.beta, self.Z = mu_tilde[:,None],self.tau_tilde[:,None], Z_ep
+        self.variance = np.diag(1./self.beta.flatten())

     def fit_full(self,K):
         """
@@ -53,7 +64,7 @@

         #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
         self.mu = np.zeros(self.N)
-        self.Sigma = K.copy()
+        self.Sigma = K.copy() - self.variance.copy()

         """
         Initial values - Cavity distribution parameters:
@@ -78,14 +89,14 @@
         self.np1 = [self.tau_tilde.copy()]
         self.np2 = [self.v_tilde.copy()]
         while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
-            update_order = np.arange(self.N)
-            random.shuffle(update_order)
+            update_order = np.random.permutation(self.N)
             for i in update_order:
                 #Cavity distribution parameters
                 self.tau_[i] = 1./self.Sigma[i,i] - self.eta*self.tau_tilde[i]
                 self.v_[i] = self.mu[i]/self.Sigma[i,i] - self.eta*self.v_tilde[i]
+                print 1./self.Sigma[i,i],self.tau_tilde[i]
                 #Marginal moments
-                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood.moments_match(i,self.tau_[i],self.v_[i])
+                self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood_function.moments_match(self.data[i],self.tau_[i],self.v_[i])
                 #Site parameters update
                 Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./self.Sigma[i,i])
                 Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - self.mu[i]/self.Sigma[i,i])
@@ -96,6 +107,7 @@
                 self.Sigma = self.Sigma - Delta_tau/(1.+ Delta_tau*self.Sigma[i,i])*np.dot(si,si.T)
             self.mu = np.dot(self.Sigma,self.v_tilde)
             self.iterations += 1
+            print self.tau_tilde[i]
         #Sigma recomputation with Cholesky decomposition
         Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*K
         B = np.eye(self.N) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K
@@ -116,7 +128,7 @@

         For nomenclature see ... 2013.
         """
-        #TODO: this doesn't work with uncertain inputs!
+        #TODO: this doesn't work with uncertain inputs!
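The "Sigma recomputation with Cholesky decomposition" step above implements the standard numerically stable identity (Rasmussen and Williams 2006, ch. 3): with S = diag(tau_tilde), B = I + S^(1/2) K S^(1/2) and Sigma = K - K S^(1/2) B^(-1) S^(1/2) K. A self-contained sketch of that computation, assuming K, tau_tilde and v_tilde are given:

    import numpy as np
    def recompute_posterior(K, tau_tilde, v_tilde):
        s = np.sqrt(tau_tilde)                            # S^(1/2) as a vector
        B = np.eye(tau_tilde.size) + s[:,None]*K*s[None,:]
        L = np.linalg.cholesky(B)
        V = np.linalg.solve(L, s[:,None]*K)               # V = L^(-1) S^(1/2) K, so V.T V = K S^(1/2) B^(-1) S^(1/2) K
        Sigma = K - np.dot(V.T, V)
        mu = np.dot(Sigma, v_tilde)
        return mu, Sigma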
""" Prior approximation parameters: @@ -251,14 +263,13 @@ class EP: self.np1 = [self.tau_tilde.copy()] self.np2 = [self.v_tilde.copy()] while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: - update_order = np.arange(self.N) - random.shuffle(update_order) + update_order = np.random.permutation(self.N) for i in update_order: #Cavity distribution parameters self.tau_[i] = 1./self.Sigma_diag[i] - self.eta*self.tau_tilde[i] self.v_[i] = self.mu[i]/self.Sigma_diag[i] - self.eta*self.v_tilde[i] #Marginal moments - self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood.moments_match(i,self.tau_[i],self.v_[i]) + self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood_function.moments_match(data[i],self.tau_[i],self.v_[i]) #Site parameters update Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./self.Sigma_diag[i]) Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - self.mu[i]/self.Sigma_diag[i]) diff --git a/GPy/likelihoods/Gaussian.py b/GPy/likelihoods/Gaussian.py index 37132cf0..eec833b8 100644 --- a/GPy/likelihoods/Gaussian.py +++ b/GPy/likelihoods/Gaussian.py @@ -39,7 +39,7 @@ class Gaussian: _95pc = mean + 2.*np.sqrt(var) return mean, _5pc, _95pc - def fit(self): + def fit_full(self): """ No approximations needed """ diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index 68fd276a..e153ce15 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -7,6 +7,7 @@ from scipy import stats import scipy as sp import pylab as pb from ..util.plot import gpplot +#from . import EP class likelihood: """ @@ -19,7 +20,7 @@ class likelihood: self.location = location self.scale = scale -class probit(likelihood): +class Probit(likelihood): """ Probit likelihood Y is expected to take values in {-1,1} @@ -28,8 +29,6 @@ class probit(likelihood): L(x) = \\Phi (Y_i*f_i) $$ """ - def __init__(self,location=0,scale=1): - likelihood.__init__(self,Y,location,scale) def moments_match(self,data_i,tau_i,v_i): """ @@ -47,24 +46,18 @@ class probit(likelihood): sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat) return Z_hat, mu_hat, sigma2_hat - def predictive_values(self,mu,var,all=False): + def predictive_values(self,mu,var): """ Compute mean, and conficence interval (percentiles 5 and 95) of the prediction """ mu = mu.flatten() var = var.flatten() mean = stats.norm.cdf(mu/np.sqrt(1+var)) - if all: - p_05 = np.zeros([mu.size]) - p_95 = np.ones([mu.size]) - return mean, p_05, p_95 - else: - return mean + p_05 = np.zeros([mu.size]) + p_95 = np.ones([mu.size]) + return mean, p_05, p_95 - def _log_likelihood_gradients(): - return np.zeros(0) # there are no parameters of whcih to compute the gradients - -class poisson(likelihood): +class Poisson(likelihood): """ Poisson likelihood Y is expected to take values in {0,1,2,...} @@ -73,9 +66,6 @@ class poisson(likelihood): L(x) = \exp(\lambda) * \lambda**Y_i / Y_i! 
$$ """ - def __init__(self,Y,location=0,scale=1): - assert len(Y[Y<0]) == 0, "Output cannot have negative values" - likelihood.__init__(self,Y,location,scale) def moments_match(self,i,tau_i,v_i): """ @@ -134,52 +124,12 @@ class poisson(likelihood): sigma2_hat = m2 - mu_hat**2 # Second central moment return float(Z_hat), float(mu_hat), float(sigma2_hat) - def predictive_values(self,mu,var,all=False): + def predictive_values(self,mu,var): """ Compute mean, and conficence interval (percentiles 5 and 95) of the prediction """ mean = np.exp(mu*self.scale + self.location) - if all: - tmp = stats.poisson.ppf(np.array([.05,.95]),mu) - p_05 = tmp[:,0] - p_95 = tmp[:,1] - return mean,p_05,p_95 - else: - return mean - - def _log_likelihood_gradients(): - raise NotImplementedError - - def plot(self,X,mu,var,phi,X_obs,Z=None,samples=0): - assert X_obs.shape[1] == 1, 'Number of dimensions must be 1' - gpplot(X,phi,phi.flatten()) - pb.plot(X_obs,self.Y,'kx',mew=1.5) - if samples: - phi_samples = np.vstack([np.random.poisson(phi.flatten(),phi.size) for s in range(samples)]) - pb.plot(X,phi_samples.T, alpha = 0.4, c='#3465a4', linewidth = 0.8) - if Z is not None: - pb.plot(Z,Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12) - -class gaussian(likelihood): - """ - Gaussian likelihood - Y is expected to take values in (-inf,inf) - """ - def moments_match(self,i,tau_i,v_i): - """ - Moments match of the marginal approximation in EP algorithm - - :param i: number of observation (int) - :param tau_i: precision of the cavity distribution (float) - :param v_i: mean/variance of the cavity distribution (float) - """ - mu = v_i/tau_i - sigma = np.sqrt(1./tau_i) - s = 1. if self.Y[i] == 0 else 1./self.Y[i] - sigma2_hat = 1./(1./sigma**2 + 1./s**2) - mu_hat = sigma2_hat*(mu/sigma**2 + self.Y[i]/s**2) - Z_hat = 1./np.sqrt(2*np.pi) * 1./np.sqrt(sigma**2+s**2) * np.exp(-.5*(mu-self.Y[i])**2/(sigma**2 + s**2)) - return Z_hat, mu_hat, sigma2_hat - - def _log_likelihood_gradients(): - raise NotImplementedError + tmp = stats.poisson.ppf(np.array([.05,.95]),mu) + p_05 = tmp[:,0] + p_95 = tmp[:,1] + return mean,p_05,p_95 diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 793e2585..49c22364 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -8,6 +8,7 @@ from .. 
import kern from ..core import model from ..util.linalg import pdinv,mdot from ..util.plot import gpplot, Tango +from ..likelihoods import EP class GP(model): """ @@ -52,8 +53,10 @@ class GP(model): self._Xstd = np.ones((1,self.X.shape[1])) self.likelihood = likelihood - assert self.X.shape[0] == self.likelihood.Y.shape[0] - self.N, self.D = self.likelihood.Y.shape + #assert self.X.shape[0] == self.likelihood.Y.shape[0] + #self.N, self.D = self.likelihood.Y.shape + assert self.X.shape[0] == self.likelihood.data.shape[0] + self.N, self.D = self.likelihood.data.shape model.__init__(self) @@ -87,7 +90,11 @@ class GP(model): For a Gaussian (or direct: TODO) likelihood, no iteration is required: this function does nothing """ - self.likelihood.fit(self.K) + self.likelihood.fit_full(self.K) + # Recompute K + noise_term + self.K = self.kern.K(self.X,slices1=self.Xslices) + self.K += self.likelihood.variance + self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K) def _model_fit_term(self): """ @@ -119,7 +126,7 @@ class GP(model): """ return np.hstack((self.kern.dK_dtheta(partial=self.dL_dK,X=self.X), self.likelihood._gradients(partial=self.dL_dK))) - def _raw_predict(self,_Xnew,slices, full_cov=False): + def _raw_predict(self,_Xnew,slices=None, full_cov=False): """ Internal helper function for making predictions, does not account for normalisation or likelihood @@ -129,11 +136,11 @@ class GP(model): KiKx = np.dot(self.Ki,Kx) if full_cov: Kxx = self.kern.K(_Xnew, slices1=slices,slices2=slices) - var = Kxx - np.dot(KiKx.T,Kx) + var = Kxx - np.dot(KiKx.T,Kx) #NOTE is the shape of v right? else: Kxx = self.kern.Kdiag(_Xnew, slices=slices) var = Kxx - np.sum(np.multiply(KiKx,Kx),0) - return mu, var + return mu, var[:,None] def predict(self,Xnew, slices=None, full_cov=False): @@ -170,26 +177,11 @@ class GP(model): return mean, _5pc, _95pc - def raw_plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None): + + def _x_frame(self,plot_limits=None,which_data='all',which_functions='all',resolution=None): """ - Plot the GP's view of the world, where the data is normalised and the likelihood is Gaussian - - :param samples: the number of a posteriori samples to plot - :param which_data: which if the training data to plot (default all) - :type which_data: 'all' or a slice object to slice self.X, self.Y - :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits - :param which_functions: which of the kernel functions to plot (additively) - :type which_functions: list of bools - :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D - - Plot the posterior of the GP. - - In one dimension, the function is plotted with a shaded region identifying two standard deviations. - - In two dimsensions, a contour-plot shows the mean predicted function - - In higher dimensions, we've no implemented this yet !TODO! 
-
-        Can plot only part of the data and part of the posterior functions using which_data and which_functions
+        Internal helper function for making plots, returns a set of new input values to plot as well as lower and upper limits
         """
-
         if which_functions=='all':
             which_functions = [True]*self.kern.Nparts
         if which_data=='all':
@@ -208,28 +200,47 @@ class GP(model):
 
         if self.X.shape[1]==1:
             Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None]
-            m,v = self._raw_predict(Xnew,slices=which_functions,full_cov=False)
-            lower, upper = m.flatten() - 2.*np.sqrt(v) , m.flatten()+ 2.*np.sqrt(v)
-            gpplot(Xnew,m,lower,upper)
-            pb.plot(X,Y,'kx',mew=1.5)
-            pb.xlim(xmin,xmax)
         elif self.X.shape[1]==2:
             resolution = resolution or 50
             xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution]
-            Xtest = np.vstack((xx.flatten(),yy.flatten())).T
-            zz,vv = self._raw_predict(Xtest,slices=which_functions,full_cov=False)
-            zz = zz.reshape(resolution,resolution)
-            pb.contour(xx,yy,zz,vmin=zz.min(),vmax=zz.max(),cmap=pb.cm.jet)
-            pb.scatter(Xorig[:,0],Xorig[:,1],40,Yorig,linewidth=0,cmap=pb.cm.jet,vmin=zz.min(),vmax=zz.max())
-            pb.xlim(xmin[0],xmax[0])
-            pb.ylim(xmin[1],xmax[1])
-
+            Xnew = np.vstack((xx.flatten(),yy.flatten())).T
         else:
             raise NotImplementedError, "Cannot plot GPs with more than two input dimensions"
+        return Xnew, xmin, xmax
 
-    def plot(self):
+    def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None,full_cov=False):
+        """
+        Plot the GP's view of the world, where the data is normalised and the likelihood is Gaussian
+
+        :param samples: the number of a posteriori samples to plot
+        :param which_data: which of the training data to plot (default all)
+        :type which_data: 'all' or a slice object to slice self.X, self.Y
+        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
+        :param which_functions: which of the kernel functions to plot (additively)
+        :type which_functions: list of bools
+        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
+
+        Plot the posterior of the GP.
+          - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
+          - In two dimensions, a contour-plot shows the mean predicted function
+          - In higher dimensions, we've not implemented this yet !TODO!
+
+        Can plot only part of the data and part of the posterior functions using which_data and which_functions
+        """
         """
         Plot the data's view of the world, with non-normalised values and GP predictions passed through the likelihood
         """
-        pass# TODO!!!!!
+        Xnew, xmin, xmax = self._x_frame()
+        m,v = self._raw_predict(Xnew)
+        if isinstance(self.likelihood,EP):
+            pb.subplot(211)
+        gpplot(Xnew,m,m-np.sqrt(v),m+np.sqrt(v))
+        pb.plot(self.X,self.likelihood.Y,'kx',mew=1.5)
+        pb.xlim(xmin,xmax)
+        if isinstance(self.likelihood,EP):
+            pb.subplot(212)
+            phi_m,phi_l,phi_u = self.likelihood.predictive_values(m,v)
+            gpplot(Xnew,phi_m,phi_l,phi_u)
+            pb.plot(self.X,self.likelihood.data,'kx',mew=1.5)
+            pb.xlim(xmin,xmax)
diff --git a/GPy/util/plot.py b/GPy/util/plot.py
index 3b4682e4..bf372869 100644
--- a/GPy/util/plot.py
+++ b/GPy/util/plot.py
@@ -11,6 +11,8 @@ def gpplot(x,mu,lower,upper,edgecol=Tango.coloursHex['darkBlue'],fillcol=Tango.c
     axes = pb.gca()
     mu = mu.flatten()
     x = x.flatten()
+    lower = lower.flatten()
+    upper = upper.flatten()
 
     #here's the mean
     axes.plot(x,mu,color=edgecol,linewidth=2)

From eb04cbed634712aeef55b40fb29ae283bfa4e480 Mon Sep 17 00:00:00 2001
From: Ricardo Andrade
Date: Fri, 1 Feb 2013 13:32:13 +0000
Subject: [PATCH 32/44] merged changes in likelihood_functions (James)

---
 GPy/examples/ep_fix.py                  |  4 +-
 GPy/likelihoods/likelihood_functions.py | 69 +------------------------
 GPy/models/GP.py                        |  7 +--
 3 files changed, 5 insertions(+), 75 deletions(-)

diff --git a/GPy/examples/ep_fix.py b/GPy/examples/ep_fix.py
index 83a58bf8..3cb35663 100644
--- a/GPy/examples/ep_fix.py
+++ b/GPy/examples/ep_fix.py
@@ -36,8 +36,8 @@ m.update_likelihood_approximation()
 #m.checkgrad(verbose=1)
 m.optimize()
 print "Round 2"
-m.update_likelihood_approximation()
+#m.update_likelihood_approximation()
 #m.EPEM()
-#m.plot()
+m.plot()
 
 #print(m)
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 5e2f0b85..39428c70 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -20,11 +20,7 @@ class likelihood_function:
         self.location = location
         self.scale = scale
 
-<<<<<<< HEAD
-class Probit(likelihood):
-=======
 class probit(likelihood_function):
->>>>>>> 346f9dd8bd3207959b87ded258e55aeb094f1ea3
     """
     Probit likelihood
     Y is expected to take values in {-1,1}
     $$
     L(x) = \\Phi (Y_i*f_i)
     $$
     """
-<<<<<<< HEAD
-=======
-    def __init__(self,location=0,scale=1):
-        likelihood_function.__init__(self,Y,location,scale)
->>>>>>> 346f9dd8bd3207959b87ded258e55aeb094f1ea3
 
     def moments_match(self,data_i,tau_i,v_i):
         """
@@ -66,11 +57,7 @@ class probit(likelihood_function):
         p_95 = np.ones([mu.size])
         return mean, p_05, p_95
 
-<<<<<<< HEAD
-class Poisson(likelihood):
-=======
-class poisson(likelihood_function):
->>>>>>> 346f9dd8bd3207959b87ded258e55aeb094f1ea3
+class Poisson(likelihood_function):
     """
     Poisson likelihood
     Y is expected to take values in {0,1,2,...}
     $$
     L(x) = \exp(-\lambda) * \lambda**Y_i / Y_i!
$$ """ -<<<<<<< HEAD -======= - def __init__(self,Y,location=0,scale=1): - assert len(Y[Y<0]) == 0, "Output cannot have negative values" - likelihood_function.__init__(self,Y,location,scale) ->>>>>>> 346f9dd8bd3207959b87ded258e55aeb094f1ea3 - def moments_match(self,i,tau_i,v_i): """ Moments match of the marginal approximation in EP algorithm @@ -148,54 +128,7 @@ class poisson(likelihood_function): Compute mean, and conficence interval (percentiles 5 and 95) of the prediction """ mean = np.exp(mu*self.scale + self.location) -<<<<<<< HEAD tmp = stats.poisson.ppf(np.array([.05,.95]),mu) p_05 = tmp[:,0] p_95 = tmp[:,1] return mean,p_05,p_95 -======= - if all: - tmp = stats.poisson.ppf(np.array([.05,.95]),mu) - p_05 = tmp[:,0] - p_95 = tmp[:,1] - return mean,mean,p_05,p_95 - else: - return mean - - def _log_likelihood_gradients(): - raise NotImplementedError - - def plot(self,X,mu,var,phi,X_obs,Z=None,samples=0): - assert X_obs.shape[1] == 1, 'Number of dimensions must be 1' - gpplot(X,phi,phi.flatten()) - pb.plot(X_obs,self.Y,'kx',mew=1.5) - if samples: - phi_samples = np.vstack([np.random.poisson(phi.flatten(),phi.size) for s in range(samples)]) - pb.plot(X,phi_samples.T, alpha = 0.4, c='#3465a4', linewidth = 0.8) - if Z is not None: - pb.plot(Z,Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12) - -class gaussian(likelihood_function): - """ - Gaussian likelihood - Y is expected to take values in (-inf,inf) - """ - def moments_match(self,i,tau_i,v_i): - """ - Moments match of the marginal approximation in EP algorithm - - :param i: number of observation (int) - :param tau_i: precision of the cavity distribution (float) - :param v_i: mean/variance of the cavity distribution (float) - """ - mu = v_i/tau_i - sigma = np.sqrt(1./tau_i) - s = 1. if self.Y[i] == 0 else 1./self.Y[i] - sigma2_hat = 1./(1./sigma**2 + 1./s**2) - mu_hat = sigma2_hat*(mu/sigma**2 + self.Y[i]/s**2) - Z_hat = 1./np.sqrt(2*np.pi) * 1./np.sqrt(sigma**2+s**2) * np.exp(-.5*(mu-self.Y[i])**2/(sigma**2 + s**2)) - return Z_hat, mu_hat, sigma2_hat - - def _log_likelihood_gradients(): - raise NotImplementedError ->>>>>>> 346f9dd8bd3207959b87ded258e55aeb094f1ea3 diff --git a/GPy/models/GP.py b/GPy/models/GP.py index 49c22364..ae192618 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -90,11 +90,8 @@ class GP(model): For a Gaussian (or direct: TODO) likelihood, no iteration is required: this function does nothing """ - self.likelihood.fit_full(self.K) - # Recompute K + noise_term - self.K = self.kern.K(self.X,slices1=self.Xslices) - self.K += self.likelihood.variance - self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K) + self.likelihood.fit_full(self.kern.compute(self.X)) + self._set_params(self._get_params()) # update the GP def _model_fit_term(self): """ From f941d629e68ac1611619d3455cc8c628ce59035e Mon Sep 17 00:00:00 2001 From: Ricardo Andrade Date: Fri, 1 Feb 2013 13:45:55 +0000 Subject: [PATCH 33/44] James' debugging of the EP/GP interface It seems that the GP-EP algorithm works now. 
---
 GPy/examples/ep_fix.py |  4 ++--
 GPy/likelihoods/EP.py  | 11 +++++++----
 GPy/models/GP.py       |  4 ++--
 3 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/GPy/examples/ep_fix.py b/GPy/examples/ep_fix.py
index 3cb35663..d1747025 100644
--- a/GPy/examples/ep_fix.py
+++ b/GPy/examples/ep_fix.py
@@ -4,7 +4,7 @@
 
 """
 Simple Gaussian Processes classification 1D
-Probit likelihood
+probit likelihood
 """
 import pylab as pb
 import numpy as np
@@ -26,7 +26,7 @@ Y = np.hstack([np.ones(N/2),np.repeat(-1,N/2)])[:,None]
 kernel = GPy.kern.rbf(1)
 
 # Define likelihood
-distribution = GPy.likelihoods.likelihood_functions.Probit()
+distribution = GPy.likelihoods.likelihood_functions.probit()
 likelihood_object = GPy.likelihoods.EP(Y,distribution)
 
 # Model definition
diff --git a/GPy/likelihoods/EP.py b/GPy/likelihoods/EP.py
index 420b138a..a88059b1 100644
--- a/GPy/likelihoods/EP.py
+++ b/GPy/likelihoods/EP.py
@@ -27,7 +27,7 @@ class EP(likelihood):
 
         #initial values for the GP variables
         self.Y = np.zeros((self.N,1))
-        self.variance = np.zeros((self.N,self.N))#np.eye(self.N)
+        self.covariance_matrix = np.eye(self.N)
         self.Z = 0
         self.YYT = None
 
@@ -50,8 +50,9 @@
         mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2
         self.Z = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant, aka Z_ep
-        self.Y = mu_tilde[:,None]
-        self.precsion = self.tau_tilde[:,None]
+        self.Y = mu_tilde[:,None]
+        self.YYT = np.dot(self.Y,self.Y.T)
+        self.precision = self.tau_tilde
         self.covariance_matrix = np.diag(1./self.precision)
 
     def fit_full(self,K):
@@ -61,9 +62,11 @@
         """
         #Prior distribution parameters: p(f|X) = N(f|0,K)
+        self.tau_tilde = np.zeros(self.N)
+        self.v_tilde = np.zeros(self.N)
 
         #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma)
         self.mu = np.zeros(self.N)
-        self.Sigma = K.copy() - self.variance.copy()
+        self.Sigma = K.copy()
 
         """
         Initial values - Cavity distribution parameters:
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index ae192618..e64da2c9 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -65,7 +65,7 @@ class GP(model):
         self.likelihood._set_params(p[self.kern.Nparam:])
 
         self.K = self.kern.K(self.X,slices1=self.Xslices)
-        self.K += self.likelihood.variance
+        self.K += self.likelihood.covariance_matrix
 
         self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)
 
@@ -90,7 +90,7 @@ class GP(model):
 
         For a Gaussian (or direct: TODO) likelihood, no iteration is required: this function does nothing
         """
-        self.likelihood.fit_full(self.kern.compute(self.X))
+        self.likelihood.fit_full(self.kern.K(self.X))
         self._set_params(self._get_params()) # update the GP
 
     def _model_fit_term(self):

From c025e8b68b482048e29166922043b452d41328bb Mon Sep 17 00:00:00 2001
From: James Hensman
Date: Fri, 1 Feb 2013 13:55:19 +0000
Subject: [PATCH 34/44] beginning of work to make sparse GP work with RA's EP
 methods

---
 GPy/likelihoods/EP.py   |  2 +-
 GPy/models/sparse_GP.py | 15 +++++++++++----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/GPy/likelihoods/EP.py b/GPy/likelihoods/EP.py
index ff612a6d..10b8828c 100644
--- a/GPy/likelihoods/EP.py
+++ b/GPy/likelihoods/EP.py
@@ -31,7 +31,7 @@ class EP(likelihood):
         self.Z = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant, aka Z_ep
         self.Y = mu_tilde[:,None]
-        self.precsion = self.tau_tilde
+        self.precision = self.tau_tilde[:,None]
         self.covariance_matrix = np.diag(1./self.precision)
 
     def
fit_full(self,K): diff --git a/GPy/models/sparse_GP.py b/GPy/models/sparse_GP.py index fe7bcc3b..5048a174 100644 --- a/GPy/models/sparse_GP.py +++ b/GPy/models/sparse_GP.py @@ -70,16 +70,23 @@ class sparse_GP(GP): self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum() self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty) - self.psi2_beta_scaled = (self.psi2*(self.beta/sf2)).sum(0) + if self.likelihood.is_heteroscedastic: + self.psi2_beta_scaled = (self.psi2*(self.likelihood.precision.reshape(self.N,1,1)/sf2)).sum(0) + #TODO: what is the likelihood is heterscedatic and there are multiple independent outputs? + else: + self.psi2_beta_scaled = (self.psi2*(self.likelihood.precision/sf2)).sum(0) else: self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum() self.psi1 = self.kern.K(self.Z,self.X) - tmp = self.psi1*(np.sqrt(self.likelihood.beta)/sf) + if self.likelihood.is_heteroscedastic: + tmp = self.psi1*(np.sqrt(self.likelihood.precision.reshape(self.N,1))/sf) + else: + tmp = self.psi1*(np.sqrt(self.likelihood.precision)/sf) self.psi2_beta_scaled = np.dot(tmp,tmp.T) - self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)#+np.eye(self.M)*1e-3) + self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm) - self.V = (self.likelihood.beta/self.scale_factor)*self.Y + self.V = (self.likelihood.precision/self.scale_factor)*self.Y self.A = mdot(self.Lmi, self.psi2_beta_scaled, self.Lmi.T) self.B = np.eye(self.M)/sf2 + self.A From 0a8686d7c0a96928f9eb5b3b773444dcbd08c859 Mon Sep 17 00:00:00 2001 From: Ricardo Andrade Date: Fri, 1 Feb 2013 15:14:11 +0000 Subject: [PATCH 35/44] EPEM is running. --- GPy/examples/ep_fix.py | 38 +++++---- GPy/likelihoods/EP.py | 102 ++++++++++++------------ GPy/likelihoods/likelihood_functions.py | 1 - GPy/models/GP.py | 68 ++++++---------- GPy/util/plot.py | 20 +++++ 5 files changed, 117 insertions(+), 112 deletions(-) diff --git a/GPy/examples/ep_fix.py b/GPy/examples/ep_fix.py index d1747025..440d00aa 100644 --- a/GPy/examples/ep_fix.py +++ b/GPy/examples/ep_fix.py @@ -1,7 +1,6 @@ # Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
# Licensed under the BSD 3-clause license (see LICENSE.txt) - """ Simple Gaussian Processes classification 1D probit likelihood @@ -19,25 +18,36 @@ X1 = np.random.normal(5,2,N/2) X2 = np.random.normal(10,2,N/2) X = np.hstack([X1,X2])[:,None] -# Outputs +# Output Y = np.hstack([np.ones(N/2),np.repeat(-1,N/2)])[:,None] # Kernel object kernel = GPy.kern.rbf(1) -# Define likelihood +# Likelihood object distribution = GPy.likelihoods.likelihood_functions.probit() -likelihood_object = GPy.likelihoods.EP(Y,distribution) +likelihood = GPy.likelihoods.EP(Y,distribution) # Model definition -m = GPy.models.GP(X,kernel,likelihood=likelihood_object) -m.ensure_default_constraints() -m.update_likelihood_approximation() -#m.checkgrad(verbose=1) -m.optimize() -print "Round 2" -#rm.update_likelihood_approximation() +m = GPy.models.GP(X,kernel,likelihood=likelihood) -#m.EPEM() -m.plot() -#print(m) +# Model constraints +m.ensure_default_constraints() + +# Optimize model +""" +EPEM runs a loop that consists of two steps: +1) EP likelihood approximation: + m.update_likelihood_approximation() +2) Parameters optimization: + m.optimize() +""" +m.EPEM() + +# Plot +pb.subplot(211) +m.plot_GP() +pb.subplot(212) +m.plot_output() + +print(m) diff --git a/GPy/likelihoods/EP.py b/GPy/likelihoods/EP.py index a88059b1..f01a5017 100644 --- a/GPy/likelihoods/EP.py +++ b/GPy/likelihoods/EP.py @@ -65,8 +65,8 @@ class EP(likelihood): self.tau_tilde = np.zeros(self.N) self.v_tilde = np.zeros(self.N) #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma) - self.mu = np.zeros(self.N) - self.Sigma = K.copy() + mu = np.zeros(self.N) + Sigma = K.copy() """ Initial values - Cavity distribution parameters: @@ -94,29 +94,27 @@ class EP(likelihood): update_order = np.random.permutation(self.N) for i in update_order: #Cavity distribution parameters - self.tau_[i] = 1./self.Sigma[i,i] - self.eta*self.tau_tilde[i] - self.v_[i] = self.mu[i]/self.Sigma[i,i] - self.eta*self.v_tilde[i] - print 1./self.Sigma[i,i],self.tau_tilde[i] + self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i] + self.v_[i] = mu[i]/Sigma[i,i] - self.eta*self.v_tilde[i] #Marginal moments self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood_function.moments_match(self.data[i],self.tau_[i],self.v_[i]) #Site parameters update - Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./self.Sigma[i,i]) - Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - self.mu[i]/self.Sigma[i,i]) + Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i]) + Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i]) self.tau_tilde[i] = self.tau_tilde[i] + Delta_tau self.v_tilde[i] = self.v_tilde[i] + Delta_v #Posterior distribution parameters update - si=self.Sigma[:,i].reshape(self.N,1) - self.Sigma = self.Sigma - Delta_tau/(1.+ Delta_tau*self.Sigma[i,i])*np.dot(si,si.T) - self.mu = np.dot(self.Sigma,self.v_tilde) + si=Sigma[:,i].reshape(self.N,1) + Sigma = Sigma - Delta_tau/(1.+ Delta_tau*Sigma[i,i])*np.dot(si,si.T) + mu = np.dot(Sigma,self.v_tilde) self.iterations += 1 - print self.tau_tilde[i] #Sigma recomptutation with Cholesky decompositon Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*K B = np.eye(self.N) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K L = jitchol(B) V,info = linalg.flapack.dtrtrs(L,Sroot_tilde_K,lower=1) - self.Sigma = K - np.dot(V.T,V) - self.mu = np.dot(self.Sigma,self.v_tilde) + Sigma = K - np.dot(V.T,V) + mu = np.dot(Sigma,self.v_tilde) epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N 
epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N self.np1.append(self.tau_tilde.copy()) @@ -139,7 +137,7 @@ class EP(likelihood): """ Kmmi, Lm, Lmi, Kmm_logdet = pdinv(Kmm) KmnKnm = np.dot(Kmn, Kmn.T) - KmmiKmn = np.dot(Kmmi,self.Kmn) + KmmiKmn = np.dot(Kmmi,Kmn) Qnn_diag = np.sum(Kmn*KmmiKmn,-2) LLT0 = Kmm.copy() @@ -221,13 +219,13 @@ class EP(likelihood): q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0) Sigma0 = diag(Knn-Qnn) + Qnn, Qnn = Knm*Kmmi*Kmn """ - self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm) - self.P0 = self.Kmn.T - self.KmnKnm = np.dot(self.P0.T, self.P0) - self.KmmiKmn = np.dot(self.Kmmi,self.P0.T) - self.Qnn_diag = np.sum(self.P0.T*self.KmmiKmn,-2) - self.Diag0 = self.Knn_diag - self.Qnn_diag - self.R0 = jitchol(self.Kmmi).T + Kmmi, self.Lm, self.Lmi, Kmm_logdet = pdinv(Kmm) + P0 = Kmn.T + KmnKnm = np.dot(P0.T, P0) + KmmiKmn = np.dot(Kmmi,P0.T) + Qnn_diag = np.sum(P0.T*KmmiKmn,-2) + Diag0 = Knn_diag - Qnn_diag + R0 = jitchol(Kmmi).T """ Posterior approximation: q(f|y) = N(f| mu, Sigma) @@ -236,11 +234,11 @@ class EP(likelihood): """ self.w = np.zeros(self.N) self.gamma = np.zeros(self.M) - self.mu = np.zeros(self.N) - self.P = self.P0.copy() - self.R = self.R0.copy() - self.Diag = self.Diag0.copy() - self.Sigma_diag = self.Knn_diag + mu = np.zeros(self.N) + P = P0.copy() + R = R0.copy() + Diag = Diag0.copy() + Sigma_diag = Knn_diag """ Initial values - Cavity distribution parameters: @@ -268,41 +266,41 @@ class EP(likelihood): update_order = np.random.permutation(self.N) for i in update_order: #Cavity distribution parameters - self.tau_[i] = 1./self.Sigma_diag[i] - self.eta*self.tau_tilde[i] - self.v_[i] = self.mu[i]/self.Sigma_diag[i] - self.eta*self.v_tilde[i] + self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i] + self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i] #Marginal moments self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood_function.moments_match(data[i],self.tau_[i],self.v_[i]) #Site parameters update - Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./self.Sigma_diag[i]) - Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - self.mu[i]/self.Sigma_diag[i]) + Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i]) + Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i]) self.tau_tilde[i] = self.tau_tilde[i] + Delta_tau self.v_tilde[i] = self.v_tilde[i] + Delta_v #Posterior distribution parameters update - dtd1 = Delta_tau*self.Diag[i] + 1. - dii = self.Diag[i] - self.Diag[i] = dii - (Delta_tau * dii**2.)/dtd1 - pi_ = self.P[i,:].reshape(1,self.M) - self.P[i,:] = pi_ - (Delta_tau*dii)/dtd1 * pi_ - Rp_i = np.dot(self.R,pi_.T) - RTR = np.dot(self.R.T,np.dot(np.eye(self.M) - Delta_tau/(1.+Delta_tau*self.Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),self.R)) - self.R = jitchol(RTR).T + dtd1 = Delta_tau*Diag[i] + 1. 
+ dii = Diag[i] + Diag[i] = dii - (Delta_tau * dii**2.)/dtd1 + pi_ = P[i,:].reshape(1,self.M) + P[i,:] = pi_ - (Delta_tau*dii)/dtd1 * pi_ + Rp_i = np.dot(R,pi_.T) + RTR = np.dot(R.T,np.dot(np.eye(self.M) - Delta_tau/(1.+Delta_tau*Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),R)) + R = jitchol(RTR).T self.w[i] = self.w[i] + (Delta_v - Delta_tau*self.w[i])*dii/dtd1 - self.gamma = self.gamma + (Delta_v - Delta_tau*self.mu[i])*np.dot(RTR,self.P[i,:].T) - self.RPT = np.dot(self.R,self.P.T) - self.Sigma_diag = self.Diag + np.sum(self.RPT.T*self.RPT.T,-1) - self.mu = self.w + np.dot(self.P,self.gamma) + self.gamma = self.gamma + (Delta_v - Delta_tau*mu[i])*np.dot(RTR,P[i,:].T) + RPT = np.dot(R,P.T) + Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1) + mu = self.w + np.dot(P,self.gamma) self.iterations += 1 #Sigma recomptutation with Cholesky decompositon - self.Diag = self.Diag0/(1.+ self.Diag0 * self.tau_tilde) - self.P = (self.Diag / self.Diag0)[:,None] * self.P0 - self.RPT0 = np.dot(self.R0,self.P0.T) - L = jitchol(np.eye(self.M) + np.dot(self.RPT0,(1./self.Diag0 - self.Diag/(self.Diag0**2))[:,None]*self.RPT0.T)) - self.R,info = linalg.flapack.dtrtrs(L,self.R0,lower=1) - self.RPT = np.dot(self.R,self.P.T) - self.Sigma_diag = self.Diag + np.sum(self.RPT.T*self.RPT.T,-1) - self.w = self.Diag * self.v_tilde - self.gamma = np.dot(self.R.T, np.dot(self.RPT,self.v_tilde)) - self.mu = self.w + np.dot(self.P,self.gamma) + Diag = Diag0/(1.+ Diag0 * self.tau_tilde) + P = (Diag / Diag0)[:,None] * P0 + RPT0 = np.dot(R0,P0.T) + L = jitchol(np.eye(self.M) + np.dot(RPT0,(1./Diag0 - Diag/(Diag0**2))[:,None]*RPT0.T)) + R,info = linalg.flapack.dtrtrs(L,R0,lower=1) + RPT = np.dot(R,P.T) + Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1) + self.w = Diag * self.v_tilde + self.gamma = np.dot(R.T, np.dot(RPT,self.v_tilde)) + mu = self.w + np.dot(P,self.gamma) epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N self.np1.append(self.tau_tilde.copy()) diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py index 39428c70..4f571e14 100644 --- a/GPy/likelihoods/likelihood_functions.py +++ b/GPy/likelihoods/likelihood_functions.py @@ -7,7 +7,6 @@ from scipy import stats import scipy as sp import pylab as pb from ..util.plot import gpplot -#from . import EP class likelihood_function: """ diff --git a/GPy/models/GP.py b/GPy/models/GP.py index e64da2c9..0c3ea6b7 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -7,7 +7,7 @@ import pylab as pb from .. 
import kern from ..core import model from ..util.linalg import pdinv,mdot -from ..util.plot import gpplot, Tango +from ..util.plot import gpplot,x_frame, Tango from ..likelihoods import EP class GP(model): @@ -175,37 +175,7 @@ class GP(model): return mean, _5pc, _95pc - def _x_frame(self,plot_limits=None,which_data='all',which_functions='all',resolution=None): - """ - Internal helper function for making plots, return a set of new input values to plot as well as lower and upper limits - """ - if which_functions=='all': - which_functions = [True]*self.kern.Nparts - if which_data=='all': - which_data = slice(None) - - X = self.X[which_data,:] - Y = self.likelihood.Y[which_data,:] - - if plot_limits is None: - xmin,xmax = X.min(0),X.max(0) - xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin) - elif len(plot_limits)==2: - xmin, xmax = plot_limits - else: - raise ValueError, "Bad limits for plotting" - - if self.X.shape[1]==1: - Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None] - elif self.X.shape[1]==2: - resolution = resolution or 50 - xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution] - Xnew = np.vstack((xx.flatten(),yy.flatten())).T - else: - raise NotImplementedError, "Cannot plot GPs with more than two input dimensions" - return Xnew, xmin, xmax - - def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None,full_cov=False): + def plot_GP(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None,full_cov=False): """ Plot the GP's view of the world, where the data is normalised and the likelihood is Gaussian @@ -223,21 +193,29 @@ class GP(model): - In higher dimensions, we've no implemented this yet !TODO! Can plot only part of the data and part of the posterior functions using which_data and which_functions - """ - """ Plot the data's view of the world, with non-normalised values and GP predictions passed through the likelihood """ - Xnew, xmin, xmax = self._x_frame() - m,v = self._raw_predict(Xnew) - if isinstance(self.likelihood,EP): - pb.subplot(211) + if which_functions=='all': + which_functions = [True]*self.kern.Nparts + if which_data=='all': + which_data = slice(None) + + Xnew, xmin, xmax = x_frame(self.X, plot_limits=plot_limits) + + m,v = self._raw_predict(Xnew, slices=which_functions) gpplot(Xnew,m,m-np.sqrt(v),m+np.sqrt(v)) - pb.plot(self.X,self.likelihood.Y,'kx',mew=1.5) + pb.plot(self.X[which_data],self.likelihood.Y[which_data],'kx',mew=1.5) pb.xlim(xmin,xmax) - if isinstance(self.likelihood,EP): - pb.subplot(212) - phi_m,phi_l,phi_u = self.likelihood.predictive_values(m,v) - gpplot(Xnew,phi_m,phi_l,phi_u) - pb.plot(self.X,self.likelihood.data,'kx',mew=1.5) - pb.xlim(xmin,xmax) + def plot_output(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None,full_cov=False): + if which_functions=='all': + which_functions = [True]*self.kern.Nparts + if which_data=='all': + which_data = slice(None) + Xnew, xmin, xmax = x_frame(self.X, plot_limits=plot_limits) + m, lower, upper = self.predict(Xnew, slices=which_functions) + gpplot(Xnew,m, lower, upper) + pb.plot(self.X[which_data],self.likelihood.data[which_data],'kx',mew=1.5) + ymin,ymax = self.likelihood.data.min()*1.2,self.likelihood.data.max()*1.2 + pb.xlim(xmin,xmax) + pb.ylim(ymin,ymax) diff --git a/GPy/util/plot.py b/GPy/util/plot.py index bf372869..60e3e488 100644 --- a/GPy/util/plot.py +++ b/GPy/util/plot.py @@ -70,4 +70,24 @@ def align_subplots(N,M,xlim=None, ylim=None): else: removeUpperTicks() 
+def x_frame(X,plot_limits=None,resolution=None): + """ + Internal helper function for making plots, returns a set of input values to plot as well as lower and upper limits + """ + if plot_limits is None: + xmin,xmax = X.min(0),X.max(0) + xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin) + elif len(plot_limits)==2: + xmin, xmax = plot_limits + else: + raise ValueError, "Bad limits for plotting" + if X.shape[1]==1: + Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None] + elif X.shape[1]==2: + resolution = resolution or 50 + xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution] + Xnew = np.vstack((xx.flatten(),yy.flatten())).T + else: + raise NotImplementedError, "Cannot define a frame with more than two input dimensions" + return Xnew, xmin, xmax From 24b6dfa086ad1d992e7a2171c0886a64b28bba62 Mon Sep 17 00:00:00 2001 From: Ricardo Andrade Date: Fri, 1 Feb 2013 16:21:26 +0000 Subject: [PATCH 36/44] Classification examples corrected (2/3) --- GPy/examples/classification.py | 78 ++++++++++++++++++++-------------- GPy/examples/ep_fix.py | 53 ----------------------- GPy/testing/unit_tests.py | 19 ++++----- 3 files changed, 54 insertions(+), 96 deletions(-) delete mode 100644 GPy/examples/ep_fix.py diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py index fb14139d..7645964d 100644 --- a/GPy/examples/classification.py +++ b/GPy/examples/classification.py @@ -3,16 +3,15 @@ """ -Simple Gaussian Processes classification +Gaussian Processes classification """ import pylab as pb import numpy as np import GPy default_seed=10000 -###################################### -## 2 dimensional example -def crescent_data(model_type='Full', inducing=10, seed=default_seed): + +def crescent_data(model_type='Full', inducing=10, seed=default_seed): #FIXME """Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood. :param model_type: type of model to fit ['Full', 'FITC', 'DTC']. @@ -30,7 +29,7 @@ def crescent_data(model_type='Full', inducing=10, seed=default_seed): # create sparse GP EP model m = GPy.models.sparse_GP_EP(data['X'],likelihood=likelihood,inducing=inducing,ep_proxy=model_type) - m.approximate_likelihood() + m.update_likelihood_approximation() print(m) # optimize @@ -42,53 +41,66 @@ def crescent_data(model_type='Full', inducing=10, seed=default_seed): return m def oil(): - """Run a Gaussian process classification on the oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.""" + """ + Run a Gaussian process classification on the oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood. 
+ """ data = GPy.util.datasets.oil() - likelihood = GPy.inference.likelihoods.probit(data['Y'][:, 0:1]) + # Kernel object + kernel = GPy.kern.rbf(12) - # create simple GP model - m = GPy.models.GP_EP(data['X'],likelihood) + # Likelihood object + distribution = GPy.likelihoods.likelihood_functions.probit() + likelihood = GPy.likelihoods.EP(data['Y'][:, 0:1],distribution) - # contrain all parameters to be positive + # Create GP model + m = GPy.models.GP(data['X'],kernel,likelihood=likelihood) + + # Contrain all parameters to be positive m.constrain_positive('') m.tie_param('lengthscale') - m.approximate_likelihood() + m.update_likelihood_approximation() - # optimize + # Optimize m.optimize() - # plot - #m.plot() print(m) return m -def toy_linear_1d_classification(model_type='Full', inducing=4, seed=default_seed): - """Simple 1D classification example. - :param model_type: type of model to fit ['Full', 'FITC', 'DTC']. +def toy_linear_1d_classification(seed=default_seed): + """ + Simple 1D classification example :param seed : seed value for data generation (default is 4). :type seed: int - :param inducing : number of inducing variables (only used for 'FITC' or 'DTC'). :type inducing: int """ + data = GPy.util.datasets.toy_linear_1d_classification(seed=seed) - likelihood = GPy.inference.likelihoods.probit(data['Y'][:, 0:1]) - assert model_type in ('Full','DTC','FITC') - # create simple GP model - if model_type=='Full': - m = GPy.models.GP_EP(data['X'],likelihood) - else: - # create sparse GP EP model - m = GPy.models.sparse_GP_EP(data['X'],likelihood=likelihood,inducing=inducing,ep_proxy=model_type) + # Kernel object + kernel = GPy.kern.rbf(1) - m.constrain_positive('var') - m.constrain_positive('len') - m.tie_param('lengthscale') - m.approximate_likelihood() + # Likelihood object + distribution = GPy.likelihoods.likelihood_functions.probit() + likelihood = GPy.likelihoods.EP(data['Y'][:, 0:1],distribution) - # Optimize and plot - m.em(plot_all=False) # EM algorithm - m.plot() + # Model definition + m = GPy.models.GP(data['X'],kernel,likelihood=likelihood) + # Optimize + """ + EPEM runs a loop that consists of two steps: + 1) EP likelihood approximation: + m.update_likelihood_approximation() + 2) Parameters optimization: + m.optimize() + """ + m.EPEM() + + # Plot + pb.subplot(211) + m.plot_GP() + pb.subplot(212) + m.plot_output() print(m) + return m diff --git a/GPy/examples/ep_fix.py b/GPy/examples/ep_fix.py deleted file mode 100644 index 440d00aa..00000000 --- a/GPy/examples/ep_fix.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
-# Licensed under the BSD 3-clause license (see LICENSE.txt) - -""" -Simple Gaussian Processes classification 1D -probit likelihood -""" -import pylab as pb -import numpy as np -import GPy -pb.ion() - -pb.close('all') - -# Inputs -N = 30 -X1 = np.random.normal(5,2,N/2) -X2 = np.random.normal(10,2,N/2) -X = np.hstack([X1,X2])[:,None] - -# Output -Y = np.hstack([np.ones(N/2),np.repeat(-1,N/2)])[:,None] - -# Kernel object -kernel = GPy.kern.rbf(1) - -# Likelihood object -distribution = GPy.likelihoods.likelihood_functions.probit() -likelihood = GPy.likelihoods.EP(Y,distribution) - -# Model definition -m = GPy.models.GP(X,kernel,likelihood=likelihood) - -# Model constraints -m.ensure_default_constraints() - -# Optimize model -""" -EPEM runs a loop that consists of two steps: -1) EP likelihood approximation: - m.update_likelihood_approximation() -2) Parameters optimization: - m.optimize() -""" -m.EPEM() - -# Plot -pb.subplot(211) -m.plot_GP() -pb.subplot(212) -m.plot_output() - -print(m) diff --git a/GPy/testing/unit_tests.py b/GPy/testing/unit_tests.py index a302b25f..4cc1c4ab 100644 --- a/GPy/testing/unit_tests.py +++ b/GPy/testing/unit_tests.py @@ -154,17 +154,16 @@ class GradientTests(unittest.TestCase): m.constrain_positive('(linear|bias|white)') self.assertTrue(m.checkgrad()) - def test_GP_EP(self): - return # Disabled TODO + def test_GP_EP_probit(self): N = 20 - X = np.hstack([np.random.rand(N/2)+1,np.random.rand(N/2)-1])[:,None] - k = GPy.kern.rbf(1) + GPy.kern.white(1) - Y = np.hstack([np.ones(N/2),-np.ones(N/2)])[:,None] - likelihood = GPy.inference.likelihoods.probit(Y) - m = GPy.models.GP_EP(X,likelihood,k) - m.constrain_positive('(var|len)') - m.approximate_likelihood() - self.assertTrue(m.checkgrad()) + X = np.hstack([np.random.normal(5,2,N/2),np.random.normal(10,2,N/2)])[:,None] + Y = np.hstack([np.ones(N/2),np.repeat(-1,N/2)])[:,None] + kernel = GPy.kern.rbf(1) + distribution = GPy.likelihoods.likelihood_functions.probit() + likelihood = GPy.likelihoods.EP(Y,distribution) + m = GPy.models.GP(X,kernel,likelihood=likelihood) + m.ensure_default_constraints() + self.assertTrue(m.EPEM) @unittest.skip("FITC will be broken for a while") def test_generalized_FITC(self): From 5447d6fbfc48c30299bfb41bf3419479d589e37c Mon Sep 17 00:00:00 2001 From: James Hensman Date: Fri, 1 Feb 2013 17:12:45 +0000 Subject: [PATCH 37/44] Assorted work on combining the EP and sparse methods --- GPy/likelihoods/EP.py | 2 + GPy/likelihoods/Gaussian.py | 5 +- GPy/models/GP.py | 4 +- GPy/models/GP_regression.py | 2 +- GPy/models/__init__.py | 9 ++-- GPy/models/sparse_GP.py | 73 ++++++++++++++++-------------- GPy/models/sparse_GP_regression.py | 44 ++++++++++++++++++ 7 files changed, 95 insertions(+), 44 deletions(-) create mode 100644 GPy/models/sparse_GP_regression.py diff --git a/GPy/likelihoods/EP.py b/GPy/likelihoods/EP.py index 1148ff4c..3b76a737 100644 --- a/GPy/likelihoods/EP.py +++ b/GPy/likelihoods/EP.py @@ -19,6 +19,7 @@ class EP(likelihood): self.data = data self.N = self.data.size self.is_heteroscedastic = True + self.Nparams = 0 #Initial values - Likelihood approximation parameters: #p(y|f) = t(f|tau_tilde,v_tilde) @@ -28,6 +29,7 @@ class EP(likelihood): #initial values for the GP variables self.Y = np.zeros((self.N,1)) self.covariance_matrix = np.eye(self.N) + self.precision = np.ones(self.N) self.Z = 0 self.YYT = None diff --git a/GPy/likelihoods/Gaussian.py b/GPy/likelihoods/Gaussian.py index 8c1c93f5..94cb0560 100644 --- a/GPy/likelihoods/Gaussian.py +++ b/GPy/likelihoods/Gaussian.py @@ -4,6 
+4,7 @@ from likelihood import likelihood class Gaussian(likelihood): def __init__(self,data,variance=1.,normalize=False): self.is_heteroscedastic = False + self.Nparams = 1 self.data = data self.N,D = data.shape self.Z = 0. # a correction factor which accounts for the approximation made @@ -18,7 +19,9 @@ class Gaussian(likelihood): self._std = np.ones((1,D)) self.Y = self.data + #TODO: make this work efficiently (only compute YYT if D>>N) self.YYT = np.dot(self.Y,self.Y.T) + self.trYYT = np.trace(self.YYT) self._set_params(np.asarray(variance)) @@ -50,4 +53,4 @@ class Gaussian(likelihood): pass def _gradients(self,partial): - return np.sum(np.diag(partial)) + return np.sum(partial) diff --git a/GPy/models/GP.py b/GPy/models/GP.py index e64da2c9..d30a31e0 100644 --- a/GPy/models/GP.py +++ b/GPy/models/GP.py @@ -31,7 +31,7 @@ class GP(model): """ - def __init__(self, X, kernel, likelihood, normalize_X=False, Xslices=None): + def __init__(self, X, likelihood, kernel, normalize_X=False, Xslices=None): # parse arguments self.Xslices = Xslices @@ -121,7 +121,7 @@ class GP(model): For the likelihood parameters, pass in alpha = K^-1 y """ - return np.hstack((self.kern.dK_dtheta(partial=self.dL_dK,X=self.X), self.likelihood._gradients(partial=self.dL_dK))) + return np.hstack((self.kern.dK_dtheta(partial=self.dL_dK,X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK)))) def _raw_predict(self,_Xnew,slices=None, full_cov=False): """ diff --git a/GPy/models/GP_regression.py b/GPy/models/GP_regression.py index 916e5284..5f9f9f3e 100644 --- a/GPy/models/GP_regression.py +++ b/GPy/models/GP_regression.py @@ -33,4 +33,4 @@ class GP_regression(GP): likelihood = likelihoods.Gaussian(Y,normalize=normalize_Y) - GP.__init__(self, X, kernel, likelihood, normalize_X=normalize_X, Xslices=Xslices) + GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X, Xslices=Xslices) diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py index 1175eb71..2269610d 100644 --- a/GPy/models/__init__.py +++ b/GPy/models/__init__.py @@ -2,14 +2,13 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) -#from sparse_GP_regression import sparse_GP_regression -# TODO ^^ remove these? +from GP import GP +from GP_regression import GP_regression +from sparse_GP import sparse_GP +from sparse_GP_regression import sparse_GP_regression from GPLVM import GPLVM from warped_GP import warpedGP # TODO: from generalized_FITC import generalized_FITC #from sparse_GPLVM import sparse_GPLVM #from uncollapsed_sparse_GP import uncollapsed_sparse_GP -from GP import GP -from GP_regression import GP_regression -#from sparse_GP import sparse_GP #from BGPLVM import Bayesian_GPLVM diff --git a/GPy/models/sparse_GP.py b/GPy/models/sparse_GP.py index 5048a174..7252f085 100644 --- a/GPy/models/sparse_GP.py +++ b/GPy/models/sparse_GP.py @@ -6,7 +6,6 @@ import pylab as pb from ..util.linalg import mdot, jitchol, chol_inv, pdinv from ..util.plot import gpplot from .. 
import kern -from ..inference.likelihoods import likelihood from GP import GP #Still TODO: @@ -34,16 +33,15 @@ class sparse_GP(GP): :type normalize_(X|Y): bool """ - def __init__(self,X,likelihood,kernel, X_uncertainty=None, Z=None,Zslices=None,M=10,normalize_X=False): - self.scale_factor = 1000.0# a scaling factor to help keep the algorithm stable + def __init__(self, X, likelihood, kernel, Z, X_uncertainty=None, Xslices=None,Zslices=None, normalize_X=False): + self.scale_factor = 1.0# a scaling factor to help keep the algorithm stable + + self.Z = Z + self.Zslices = Zslices + self.Xslices = Xslices + self.M = Z.shape[0] + self.likelihood = likelihood - if Z is None: - self.Z = np.random.permutation(X.copy())[:M] - self.M = M - else: - assert Z.shape[1]==X.shape[1] - self.Z = Z - self.M = Z.shape[0] if X_uncertainty is None: self.has_uncertain_inputs=False else: @@ -51,7 +49,7 @@ class sparse_GP(GP): self.has_uncertain_inputs=True self.X_uncertainty = X_uncertainty - GP.__init__(self, X, Y, kernel=kernel, normalize_X=normalize_X, Xslices=Xslices) + GP.__init__(self, X, likelihood, kernel=kernel, normalize_X=normalize_X, Xslices=Xslices) #normalise X uncertainty also if self.has_uncertain_inputs: @@ -67,7 +65,7 @@ class sparse_GP(GP): # kernel computations, using BGPLVM notation self.Kmm = self.kern.K(self.Z) if self.has_uncertain_inputs: - self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum() + self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncerTainty) self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty) if self.likelihood.is_heteroscedastic: @@ -76,17 +74,18 @@ class sparse_GP(GP): else: self.psi2_beta_scaled = (self.psi2*(self.likelihood.precision/sf2)).sum(0) else: - self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum() + self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices) self.psi1 = self.kern.K(self.Z,self.X) if self.likelihood.is_heteroscedastic: tmp = self.psi1*(np.sqrt(self.likelihood.precision.reshape(self.N,1))/sf) else: tmp = self.psi1*(np.sqrt(self.likelihood.precision)/sf) self.psi2_beta_scaled = np.dot(tmp,tmp.T) + self.psi2 = self.psi1.T[:,:,None]*self.psi1.T[:,None,:] # TODO: remove me for efficiency and stability self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm) - self.V = (self.likelihood.precision/self.scale_factor)*self.Y + self.V = (self.likelihood.precision/self.scale_factor)*self.likelihood.Y self.A = mdot(self.Lmi, self.psi2_beta_scaled, self.Lmi.T) self.B = np.eye(self.M)/sf2 + self.A @@ -97,27 +96,38 @@ class sparse_GP(GP): self.C = mdot(self.Lmi.T, self.Bi, self.Lmi) self.E = mdot(self.C, self.psi1VVpsi1/sf2, self.C.T) - # Compute dL_dpsi # FIXME - self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N) + # Compute dL_dpsi # FIXME: this is untested for the het. 
case + self.dL_dpsi0 = - 0.5 * self.D * self.likelihood.precision * np.ones(self.N) self.dL_dpsi1 = mdot(self.V, self.psi1V.T,self.C).T - self.dL_dpsi2 = 0.5 * self.beta * self.D * self.Kmmi[None,:,:] # dB - self.dL_dpsi2 += - 0.5 * self.beta/sf2 * self.D * self.C[None,:,:] # dC - self.dL_dpsi2 += - 0.5 * self.beta * self.E[None,:,:] # dD + if self.likelihood.is_heteroscedastic: + self.dL_dpsi2 = 0.5 * self.likelihood.precision[:,None,None] * self.D * self.Kmmi[None,:,:] # dB + self.dL_dpsi2 += - 0.5 * self.likelihood.precision[:,None,None]/sf2 * self.D * self.C[None,:,:] # dC + self.dL_dpsi2 += - 0.5 * self.likelihood.precision[:,None,None]* self.E[None,:,:] # dD + else: + self.dL_dpsi2 = 0.5 * self.likelihood.precision * self.D * self.Kmmi[None,:,:] # dB + self.dL_dpsi2 += - 0.5 * self.likelihood.precision/sf2 * self.D * self.C[None,:,:] # dC + self.dL_dpsi2 += - 0.5 * self.likelihood.precision * self.E[None,:,:] # dD # Compute dL_dKmm self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi)*sf2 # dB self.dL_dKmm += -0.5 * self.D * (- self.C/sf2 - 2.*mdot(self.C, self.psi2_beta_scaled, self.Kmmi) + self.Kmmi) # dC self.dL_dKmm += np.dot(np.dot(self.E*sf2, self.psi2_beta_scaled) - np.dot(self.C, self.psi1VVpsi1), self.Kmmi) + 0.5*self.E # dD + #the partial derivative vector for the likelihood + self.partial_for_likelihood = - 0.5 * self.D*self.likelihood.precision + 0.5 * (self.likelihood.Y**2).sum(1)*self.likelihood.precision**2 #dA + self.partial_for_likelihood += 0.5 * self.D * (self.psi0*self.likelihood.precision**2 - (self.psi2*self.Kmmi[None,:,:]*self.likelihood.precision[:,None,None]**2).sum(1).sum(1)/sf2) #dB + #self.partial_for_likelihood += 0.5 * self.D * np.sum(self.Bi*self.A)*self.likelihood.precision #dC + #self.partial_for_likelihood += -np.diag(np.dot((self.C - 0.5 * mdot(self.C,self.psi2_beta_scaled,self.C) ) , self.psi1VVpsi1 ))*self.likelihood.precision #dD + def _set_params(self, p): self.Z = p[:self.M*self.Q].reshape(self.M, self.Q) - self.beta = p[self.M*self.Q] # FIXME - self.kern._set_params(p[self.Z.size + 1:]) + self.kern._set_params(p[self.Z.size:self.Z.size+self.kern.Nparam]) + self.likelihood._set_params(p[self.Z.size+self.kern.Nparam:]) self._computations() def _get_params(self): - return np.hstack([self.Z.flatten(),GP._get_params(self)) + return np.hstack([self.Z.flatten(),GP._get_params(self)]) def _get_param_names(self): return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + GP._get_param_names(self) @@ -125,24 +135,17 @@ class sparse_GP(GP): def log_likelihood(self): """ Compute the (lower bound on the) log marginal likelihood """ sf2 = self.scale_factor**2 - A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta)) -0.5*self.beta*self.trYYT # FIXME - B = -0.5*self.D*(self.beta*self.psi0-np.trace(self.A)*sf2)# FIXME + if self.likelihood.is_heteroscedastic: + A = -0.5*self.N*self.D*np.log(2.*np.pi) +0.5*np.sum(np.log(self.likelihood.precision)) -0.5*np.sum(self.V*self.likelihood.Y) + else: + A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.likelihood.precision)) -0.5*self.likelihood.precision*self.likelihood.trYYT + B = -0.5*self.D*(np.sum(self.likelihood.precision*self.psi0) - np.trace(self.A)*sf2) C = -0.5*self.D * (self.B_logdet + self.M*np.log(sf2)) D = +0.5*np.sum(self.psi1VVpsi1 * self.C) return A+B+C+D def _log_likelihood_gradients(self): - return np.hstack([self.dL_dZ().flatten(), GP._log_likelihood_gradients(self)]) - - # FIXME: move this into the lieklihood class - def 
dL_dbeta(self): - sf2 = self.scale_factor**2 - dA_dbeta = 0.5 * self.N*self.D/self.beta - 0.5 * self.trYYT - dB_dbeta = - 0.5 * self.D * (self.psi0 - np.trace(self.A)/self.beta*sf2) - dC_dbeta = - 0.5 * self.D * np.sum(self.Bi*self.A)/self.beta - dD_dbeta = np.sum((self.C - 0.5 * mdot(self.C,self.psi2_beta_scaled,self.C) ) * self.psi1VVpsi1 )/self.beta - - return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta) + return np.hstack((self.dL_dZ().flatten(), self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood))) def dL_dtheta(self): """ diff --git a/GPy/models/sparse_GP_regression.py b/GPy/models/sparse_GP_regression.py new file mode 100644 index 00000000..178c8023 --- /dev/null +++ b/GPy/models/sparse_GP_regression.py @@ -0,0 +1,44 @@ +# Copyright (c) 2012, James Hensman +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import numpy as np +from sparse_GP import sparse_GP +from .. import likelihoods +from .. import kern + +class sparse_GP_regression(sparse_GP): + """ + Gaussian Process model for regression + + This is a thin wrapper around the GP class, with a set of sensible defalts + + :param X: input observations + :param Y: observed values + :param kernel: a GPy kernel, defaults to rbf+white + :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales) + :type normalize_X: False|True + :param normalize_Y: whether to normalize the input data before computing (predictions will be in original scales) + :type normalize_Y: False|True + :param Xslices: how the X,Y data co-vary in the kernel (i.e. which "outputs" they correspond to). See (link:slicing) + :rtype: model object + + .. Note:: Multiple independent outputs are allowed using columns of Y + + """ + + def __init__(self,X,Y,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None,Z=None, M=10): + #kern defaults to rbf + if kernel is None: + kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1],1e-3) + + #Z defaults to a subset of the data + if Z is None: + Z = np.random.permutation(X.copy())[:M] + else: + assert Z.shape[1]==X.shape[1] + + #likelihood defaults to Gaussian + likelihood = likelihoods.Gaussian(Y,normalize=normalize_Y) + + sparse_GP.__init__(self, X, likelihood, kernel, Z, normalize_X=normalize_X, Xslices=Xslices) From 2b756e96e1902168138a864e8a8048c306cd93c8 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Fri, 1 Feb 2013 17:42:51 +0000 Subject: [PATCH 38/44] made the BGPLVM work in the new world order --- GPy/models/BGPLVM.py | 28 +++++++++++++++++++--------- GPy/models/__init__.py | 4 ++-- GPy/models/sparse_GP.py | 2 +- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/GPy/models/BGPLVM.py b/GPy/models/BGPLVM.py index db147944..ffa7df54 100644 --- a/GPy/models/BGPLVM.py +++ b/GPy/models/BGPLVM.py @@ -5,10 +5,12 @@ import numpy as np import pylab as pb import sys, pdb from GPLVM import GPLVM -from sparse_GP_regression import sparse_GP_regression +from sparse_GP import sparse_GP from GPy.util.linalg import pdinv +from ..likelihoods import Gaussian +from .. 
import kern -class Bayesian_GPLVM(sparse_GP_regression, GPLVM): +class Bayesian_GPLVM(sparse_GP, GPLVM): """ Bayesian Gaussian Process Latent Variable Model @@ -20,15 +22,23 @@ class Bayesian_GPLVM(sparse_GP_regression, GPLVM): :type init: 'PCA'|'random' """ - def __init__(self, Y, Q, init='PCA', **kwargs): + def __init__(self, Y, Q, init='PCA', M=10, Z=None, **kwargs): X = self.initialise_latent(init, Q, Y) - S = np.ones_like(X) * 1e-2# - sparse_GP_regression.__init__(self, X, Y, X_uncertainty = S, **kwargs) + + if Z is None: + Z = np.random.permutation(X.copy())[:M] + else: + assert Z.shape[1]==X.shape[1] + + kernel = kern.rbf(Q) + kern.white(Q) + + S = np.ones_like(X) * 1e-2# + sparse_GP.__init__(self, X, Gaussian(Y), X_uncertainty = S, Z=Z,**kwargs) def _get_param_names(self): X_names = sum([['X_%i_%i'%(n,q) for n in range(self.N)] for q in range(self.Q)],[]) S_names = sum([['S_%i_%i'%(n,q) for n in range(self.N)] for q in range(self.Q)],[]) - return (X_names + S_names + sparse_GP_regression._get_param_names(self)) + return (X_names + S_names + sparse_GP._get_param_names(self)) def _get_params(self): """ @@ -40,13 +50,13 @@ class Bayesian_GPLVM(sparse_GP_regression, GPLVM): =============================================================== """ - return np.hstack((self.X.flatten(), self.X_uncertainty.flatten(), sparse_GP_regression._get_params(self))) + return np.hstack((self.X.flatten(), self.X_uncertainty.flatten(), sparse_GP._get_params(self))) def _set_params(self,x): N, Q = self.N, self.Q self.X = x[:self.X.size].reshape(N,Q).copy() self.X_uncertainty = x[(N*Q):(2*N*Q)].reshape(N,Q).copy() - sparse_GP_regression._set_params(self, x[(2*N*Q):]) + sparse_GP._set_params(self, x[(2*N*Q):]) def dL_dmuS(self): dL_dmu_psi0, dL_dS_psi0 = self.kern.dpsi1_dmuS(self.dL_dpsi1,self.Z,self.X,self.X_uncertainty) @@ -58,5 +68,5 @@ class Bayesian_GPLVM(sparse_GP_regression, GPLVM): return np.hstack((dL_dmu.flatten(), dL_dS.flatten())) def _log_likelihood_gradients(self): - return np.hstack((self.dL_dmuS().flatten(), sparse_GP_regression._log_likelihood_gradients(self))) + return np.hstack((self.dL_dmuS().flatten(), sparse_GP._log_likelihood_gradients(self))) diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py index 2269610d..9cc8fa68 100644 --- a/GPy/models/__init__.py +++ b/GPy/models/__init__.py @@ -9,6 +9,6 @@ from sparse_GP_regression import sparse_GP_regression from GPLVM import GPLVM from warped_GP import warpedGP # TODO: from generalized_FITC import generalized_FITC -#from sparse_GPLVM import sparse_GPLVM +from sparse_GPLVM import sparse_GPLVM #from uncollapsed_sparse_GP import uncollapsed_sparse_GP -#from BGPLVM import Bayesian_GPLVM +from BGPLVM import Bayesian_GPLVM diff --git a/GPy/models/sparse_GP.py b/GPy/models/sparse_GP.py index 7252f085..f35b3918 100644 --- a/GPy/models/sparse_GP.py +++ b/GPy/models/sparse_GP.py @@ -65,7 +65,7 @@ class sparse_GP(GP): # kernel computations, using BGPLVM notation self.Kmm = self.kern.K(self.Z) if self.has_uncertain_inputs: - self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncerTainty) + self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty) self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty) if self.likelihood.is_heteroscedastic: From 5e2baf191954521fc84280d2ed363e32be9548c2 Mon Sep 17 00:00:00 2001 From: Ricardo Andrade Date: Fri, 1 Feb 2013 17:58:21 +0000 Subject: [PATCH 39/44] Changes in plotting functions. 
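
The split is: x_frame1D/x_frame2D build the grid of test inputs,
plot_internal() shows the latent GP against the normalised pseudo-data, and
plot() pushes predictions through the likelihood and overlays the raw data.
As a rough usage sketch (assuming an already fitted 1D model m; this mirrors
what plot() does internally rather than adding new API):

    import pylab as pb
    from GPy.util.plot import gpplot, x_frame1D

    Xnew, xmin, xmax = x_frame1D(m.X)        # padded grid over the inputs
    mu, lower, upper = m.predict(Xnew)       # mean and 5/95 percentiles
    gpplot(Xnew, mu, lower, upper)           # mean curve plus credible band
    pb.plot(m.X, m.likelihood.data, 'kx', mew=1.5)
    pb.xlim(xmin, xmax)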
---
 GPy/examples/classification.py          |  6 +--
 GPy/likelihoods/Gaussian.py             |  2 +-
 GPy/likelihoods/likelihood_functions.py |  4 +-
 GPy/models/GP.py                        | 59 ++++++++++++++++++-------
 GPy/util/plot.py                        | 31 +++++++++----
 5 files changed, 71 insertions(+), 31 deletions(-)

diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py
index 7645964d..c25ea124 100644
--- a/GPy/examples/classification.py
+++ b/GPy/examples/classification.py
@@ -84,7 +84,7 @@ def toy_linear_1d_classification(seed=default_seed):
     likelihood = GPy.likelihoods.EP(data['Y'][:, 0:1],distribution)
 
     # Model definition
-    m = GPy.models.GP(data['X'],kernel,likelihood=likelihood)
+    m = GPy.models.GP(data['X'],likelihood=likelihood,kernel=kernel)
 
     # Optimize
@@ -98,9 +98,9 @@
 
     # Plot
     pb.subplot(211)
-    m.plot_GP()
+    m.plot_internal()
     pb.subplot(212)
-    m.plot_output()
+    m.plot()
 
     print(m)
     return m
diff --git a/GPy/likelihoods/Gaussian.py b/GPy/likelihoods/Gaussian.py
index 94cb0560..ff358b2d 100644
--- a/GPy/likelihoods/Gaussian.py
+++ b/GPy/likelihoods/Gaussian.py
@@ -42,7 +42,7 @@ class Gaussian(likelihood):
         """
         mean = mu*self._std + self._mean
         true_var = (var + self._variance)*self._std**2
-        _5pc = mean + mean - 2.*np.sqrt(var)
+        _5pc = mean - 2.*np.sqrt(var)
         _95pc = mean + 2.*np.sqrt(var)
         return mean, _5pc, _95pc
 
diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py
index 4f571e14..de97824a 100644
--- a/GPy/likelihoods/likelihood_functions.py
+++ b/GPy/likelihoods/likelihood_functions.py
@@ -52,8 +52,8 @@ class probit(likelihood_function):
         mu = mu.flatten()
         var = var.flatten()
         mean = stats.norm.cdf(mu/np.sqrt(1+var))
-        p_05 = np.zeros([mu.size])
-        p_95 = np.ones([mu.size])
+        p_05 = np.zeros(mu.shape)
+        p_95 = np.ones(mu.shape)
         return mean, p_05, p_95
 
 class Poisson(likelihood_function):
diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index db00755c..c640e529 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -7,7 +7,7 @@
 import pylab as pb
 from .. import kern
 from ..core import model
 from ..util.linalg import pdinv,mdot
-from ..util.plot import gpplot,x_frame, Tango
+from ..util.plot import gpplot,x_frame1D,x_frame2D, Tango
 from ..likelihoods import EP
 
 class GP(model):
@@ -175,7 +175,7 @@ class GP(model):
 
         return mean, _5pc, _95pc
 
-    def plot_GP(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None,full_cov=False):
+    def plot_internal(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None,full_cov=False):
         """
         Plot the GP's view of the world, where the data is normalised and the likelihood is Gaussian
@@ -200,22 +200,49 @@ class GP(model):
         if which_data=='all':
             which_data = slice(None)
 
-        Xnew, xmin, xmax = x_frame(self.X, plot_limits=plot_limits)
+        if self.X.shape[1] == 1:
+            Xnew, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits)
+            m,v = self._raw_predict(Xnew, slices=which_functions)
+            gpplot(Xnew,m,m-np.sqrt(v),m+np.sqrt(v))
+            pb.plot(self.X[which_data],self.likelihood.Y[which_data],'kx',mew=1.5)
+            pb.xlim(xmin,xmax)
+        elif self.X.shape[1]==2:
+            resolution = resolution or 50
+            Xnew, xx, yy, xmin, xmax = x_frame2D(self.X, plot_limits=plot_limits)
+            m,v = self._raw_predict(Xnew, slices=which_functions)
+            m = m.reshape(resolution,resolution)
+            pb.contour(xx,yy,m,vmin=m.min(),vmax=m.max(),cmap=pb.cm.jet)
+            pb.scatter(self.X[:,0],self.X[:,1],40,self.likelihood.Y,linewidth=0,cmap=pb.cm.jet,vmin=m.min(),vmax=m.max())
+            pb.xlim(xmin[0],xmax[0])
+            pb.ylim(xmin[1],xmax[1])
+        else:
+            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
 
-        m,v = self._raw_predict(Xnew, slices=which_functions)
-        gpplot(Xnew,m,m-np.sqrt(v),m+np.sqrt(v))
-        pb.plot(self.X[which_data],self.likelihood.Y[which_data],'kx',mew=1.5)
-        pb.xlim(xmin,xmax)
-
-    def plot_output(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None,full_cov=False):
+    def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None,full_cov=False):
         if which_functions=='all':
             which_functions = [True]*self.kern.Nparts
         if which_data=='all':
             which_data = slice(None)
-        Xnew, xmin, xmax = x_frame(self.X, plot_limits=plot_limits)
-        m, lower, upper = self.predict(Xnew, slices=which_functions)
-        gpplot(Xnew,m, lower, upper)
-        pb.plot(self.X[which_data],self.likelihood.data[which_data],'kx',mew=1.5)
-        ymin,ymax = self.likelihood.data.min()*1.2,self.likelihood.data.max()*1.2
-        pb.xlim(xmin,xmax)
-        pb.ylim(ymin,ymax)
+
+        if self.X.shape[1] == 1:
+            Xnew, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits)
+            m, lower, upper = self.predict(Xnew, slices=which_functions)
+            gpplot(Xnew,m, lower, upper)
+            pb.plot(self.X[which_data],self.likelihood.data[which_data],'kx',mew=1.5)
+            ymin,ymax = self.likelihood.data.min()*1.2,self.likelihood.data.max()*1.2
+            pb.xlim(xmin,xmax)
+            pb.ylim(ymin,ymax)
+        elif self.X.shape[1]==2:
+            resolution = resolution or 50
+            Xnew, xx, yy, xmin, xmax = x_frame2D(self.X, plot_limits=plot_limits)
+            m, lower, upper = self.predict(Xnew, slices=which_functions)
+            m = m.reshape(resolution,resolution)
+            pb.contour(xx,yy,m,vmin=m.min(),vmax=m.max(),cmap=pb.cm.jet)
+            pb.scatter(self.X[:,0],self.X[:,1],40,self.likelihood.data,linewidth=0,cmap=pb.cm.jet,vmin=m.min(),vmax=m.max())
+            pb.xlim(xmin[0],xmax[0])
+            pb.ylim(xmin[1],xmax[1])
+        else:
+            raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
diff --git a/GPy/util/plot.py b/GPy/util/plot.py
index 60e3e488..7b346330 100644
--- a/GPy/util/plot.py
+++ b/GPy/util/plot.py
@@ -70,10 +70,11
diff --git a/GPy/util/plot.py b/GPy/util/plot.py
index 60e3e488..7b346330 100644
--- a/GPy/util/plot.py
+++ b/GPy/util/plot.py
@@ -70,10 +70,11 @@ def align_subplots(N,M,xlim=None, ylim=None):
         else:
             removeUpperTicks()

-def x_frame(X,plot_limits=None,resolution=None):
+def x_frame1D(X,plot_limits=None,resolution=None):
     """
     Internal helper function for making plots, returns a set of input values to plot as well as lower and upper limits
     """
+    assert X.shape[1] == 1, "x_frame1D is defined for one-dimensional inputs"
     if plot_limits is None:
         xmin,xmax = X.min(0),X.max(0)
         xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
@@ -82,12 +83,24 @@ def x_frame(X,plot_limits=None,resolution=None):
     else:
         raise ValueError, "Bad limits for plotting"

-    if X.shape[1]==1:
-        Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None]
-    elif X.shape[1]==2:
-        resolution = resolution or 50
-        xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution]
-        Xnew = np.vstack((xx.flatten(),yy.flatten())).T
-    else:
-        raise NotImplementedError, "Cannot define a frame with more than two input dimensions"
+    Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None]
     return Xnew, xmin, xmax
+
+def x_frame2D(X,plot_limits=None,resolution=None):
+    """
+    Internal helper function for making plots, returns a set of input values to plot as well as lower and upper limits
+    """
+    assert X.shape[1] == 2, "x_frame2D is defined for two-dimensional inputs"
+    if plot_limits is None:
+        xmin,xmax = X.min(0),X.max(0)
+        xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin)
+    elif len(plot_limits)==2:
+        xmin, xmax = plot_limits
+    else:
+        raise ValueError, "Bad limits for plotting"
+
+    resolution = resolution or 50
+    xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution]
+    Xnew = np.vstack((xx.flatten(),yy.flatten())).T
+    return Xnew, xmin, xmax, xx, yy
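A quick usage sketch of the two new helpers, assuming the patched GPy/util/plot.py is importable (x_frame2D is taken to return Xnew, xmin, xmax, xx, yy, matching the call sites in GP.py above); the shapes follow from the 200-point default in x_frame1D and the 50x50 default grid in x_frame2D:

    import numpy as np
    from GPy.util.plot import x_frame1D, x_frame2D

    X1 = np.random.randn(20, 1)
    Xnew, xmin, xmax = x_frame1D(X1)          # test inputs, limits padded 20% beyond the data
    print(Xnew.shape)                         # (200, 1)

    X2 = np.random.randn(20, 2)
    Xnew, xmin, xmax, xx, yy = x_frame2D(X2)  # flattened grid plus the mgrid coords
    print(Xnew.shape, xx.shape)               # (2500, 2) (50, 50)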
From 3a61c39cb86bbc6d247235516420ea26751cb040 Mon Sep 17 00:00:00 2001
From: James Hensman
Date: Mon, 4 Feb 2013 10:22:01 +0000
Subject: [PATCH 40/44] partial derivatives for the new likelihood framework

---
 GPy/models/sparse_GP.py | 25 +++++++++++++++++++------
 1 file changed, 19 insertions(+), 6 deletions(-)

diff --git a/GPy/models/sparse_GP.py b/GPy/models/sparse_GP.py
index f35b3918..d3592d69 100644
--- a/GPy/models/sparse_GP.py
+++ b/GPy/models/sparse_GP.py
@@ -114,10 +114,23 @@ class sparse_GP(GP):
         self.dL_dKmm += np.dot(np.dot(self.E*sf2, self.psi2_beta_scaled) - np.dot(self.C, self.psi1VVpsi1), self.Kmmi) + 0.5*self.E # dD

         #the partial derivative vector for the likelihood
-        self.partial_for_likelihood = - 0.5 * self.D*self.likelihood.precision + 0.5 * (self.likelihood.Y**2).sum(1)*self.likelihood.precision**2 #dA
-        self.partial_for_likelihood += 0.5 * self.D * (self.psi0*self.likelihood.precision**2 - (self.psi2*self.Kmmi[None,:,:]*self.likelihood.precision[:,None,None]**2).sum(1).sum(1)/sf2) #dB
-        #self.partial_for_likelihood += 0.5 * self.D * np.sum(self.Bi*self.A)*self.likelihood.precision #dC
-        #self.partial_for_likelihood += -np.diag(np.dot((self.C - 0.5 * mdot(self.C,self.psi2_beta_scaled,self.C) ) , self.psi1VVpsi1 ))*self.likelihood.precision #dD
+        if self.likelihood.Nparams == 0:
+            #save computation here: no likelihood parameters, nothing to differentiate
+            self.partial_for_likelihood = None
+        elif self.likelihood.is_heteroscedastic:
+            raise NotImplementedError, "heteroscedastic derivatives not implemented"
+            #self.partial_for_likelihood = - 0.5 * self.D*self.likelihood.precision + 0.5 * (self.likelihood.Y**2).sum(1)*self.likelihood.precision**2 #dA
+            #self.partial_for_likelihood += 0.5 * self.D * (self.psi0*self.likelihood.precision**2 - (self.psi2*self.Kmmi[None,:,:]*self.likelihood.precision[:,None,None]**2).sum(1).sum(1)/sf2) #dB
+            #self.partial_for_likelihood += 0.5 * self.D * np.sum(self.Bi*self.A)*self.likelihood.precision #dC
+            #self.partial_for_likelihood += -np.diag(np.dot((self.C - 0.5 * mdot(self.C,self.psi2_beta_scaled,self.C) ) , self.psi1VVpsi1 ))*self.likelihood.precision #dD
+        else:
+            #likelihood is not heteroscedastic
+            beta = self.likelihood.precision
+            dbeta = 0.5 * self.N*self.D/beta - 0.5 * np.sum(np.square(self.likelihood.Y))
+            dbeta += - 0.5 * self.D * (self.psi0.sum() - np.trace(self.A)/beta*sf2)
+            dbeta += - 0.5 * self.D * np.sum(self.Bi*self.A)/beta
+            dbeta += np.sum((self.C - 0.5 * mdot(self.C,self.psi2_beta_scaled,self.C) ) * self.psi1VVpsi1 )/beta
+            self.partial_for_likelihood = -dbeta*self.likelihood.precision


     def _set_params(self, p):
@@ -195,9 +208,9 @@ class sparse_GP(GP):

     def plot(self, *args, **kwargs):
         """
-        Plot the fitted model: just call the GP_regression plot function and then add inducing inputs
+        Plot the fitted model: just call the GP plot function and then add inducing inputs
         """
-        GP_regression.plot(self,*args,**kwargs)
+        GP.plot(self,*args,**kwargs)
         if self.Q==1:
             pb.plot(self.Z,self.Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12)
         if self.has_uncertain_inputs:

From 7a5466068962fd3c57b694c78ffa1030902ec190 Mon Sep 17 00:00:00 2001
From: James Hensman
Date: Mon, 4 Feb 2013 10:29:01 +0000
Subject: [PATCH 41/44] fixed bug in my schoolboy mathematics

---
 GPy/likelihoods/Gaussian.py | 2 +-
 GPy/models/sparse_GP.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/GPy/likelihoods/Gaussian.py b/GPy/likelihoods/Gaussian.py
index ff358b2d..630b5d91 100644
--- a/GPy/likelihoods/Gaussian.py
+++ b/GPy/likelihoods/Gaussian.py
@@ -32,7 +32,7 @@ class Gaussian(likelihood):
         return ["noise variance"]

     def _set_params(self,x):
-        self._variance = x
+        self._variance = float(x)
         self.covariance_matrix = np.eye(self.N)*self._variance
         self.precision = 1./self._variance

diff --git a/GPy/models/sparse_GP.py b/GPy/models/sparse_GP.py
index d3592d69..6ba74e38 100644
--- a/GPy/models/sparse_GP.py
+++ b/GPy/models/sparse_GP.py
@@ -130,7 +130,7 @@ class sparse_GP(GP):
             dbeta += - 0.5 * self.D * (self.psi0.sum() - np.trace(self.A)/beta*sf2)
             dbeta += - 0.5 * self.D * np.sum(self.Bi*self.A)/beta
             dbeta += np.sum((self.C - 0.5 * mdot(self.C,self.psi2_beta_scaled,self.C) ) * self.psi1VVpsi1 )/beta
-            self.partial_for_likelihood = -dbeta*self.likelihood.precision
+            self.partial_for_likelihood = -dbeta*self.likelihood.precision**2
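The **2 that PATCH 41 adds above is the chain-rule factor from differentiating with respect to the noise variance rather than the precision: with beta = 1/sigma^2, dL/dsigma^2 = (dbeta/dsigma^2) * dL/dbeta = -beta^2 * dL/dbeta. A self-contained numerical check with an arbitrary smooth L (nothing here touches GPy):

    import numpy as np

    # arbitrary smooth function of the precision, and its analytic derivative
    L = lambda beta: 0.5*np.log(beta) - 0.3*beta
    dL_dbeta = lambda beta: 0.5/beta - 0.3

    s2 = 0.7          # noise variance
    beta = 1./s2      # precision
    eps = 1e-6

    # dL/ds2 by central differences, vs. the chain rule -beta**2 * dL/dbeta
    numerical = (L(1./(s2+eps)) - L(1./(s2-eps))) / (2.*eps)
    analytic = -beta**2 * dL_dbeta(beta)
    print(numerical, analytic)    # agree to ~1e-9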
From 049e60f16ba87ed8db51524d6eb9f40efa640d01 Mon Sep 17 00:00:00 2001
From: Ricardo Andrade
Date: Mon, 4 Feb 2013 12:01:27 +0000
Subject: [PATCH 42/44] var[:,None] added in full_cov = false, sparse_GP

---
 GPy/models/GP.py        | 2 +-
 GPy/models/sparse_GP.py | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/GPy/models/GP.py b/GPy/models/GP.py
index c640e529..2afa4252 100644
--- a/GPy/models/GP.py
+++ b/GPy/models/GP.py
@@ -133,7 +133,7 @@ class GP(model):
         KiKx = np.dot(self.Ki,Kx)
         if full_cov:
             Kxx = self.kern.K(_Xnew, slices1=slices,slices2=slices)
-            var = Kxx - np.dot(KiKx.T,Kx) #NOTE is the shape of v right?
+            var = Kxx - np.dot(KiKx.T,Kx) #NOTE this won't work for plotting
         else:
             Kxx = self.kern.Kdiag(_Xnew, slices=slices)
             var = Kxx - np.sum(np.multiply(KiKx,Kx),0)

diff --git a/GPy/models/sparse_GP.py b/GPy/models/sparse_GP.py
index 6ba74e38..a90f73cb 100644
--- a/GPy/models/sparse_GP.py
+++ b/GPy/models/sparse_GP.py
@@ -196,15 +196,14 @@ class sparse_GP(GP):
         Kx = self.kern.K(self.Z, Xnew)
         mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V)
-
         if full_cov:
             Kxx = self.kern.K(Xnew)
-            var = Kxx - mdot(Kx.T, (self.Kmmi - self.C/self.scale_factor**2), Kx)
+            var = Kxx - mdot(Kx.T, (self.Kmmi - self.C/self.scale_factor**2), Kx) #NOTE this won't work for plotting
         else:
             Kxx = self.kern.Kdiag(Xnew)
             var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0)
-        return mu,var
+        return mu,var[:,None]

From 9b69b049337f325adf6762c155b3b135422ada20 Mon Sep 17 00:00:00 2001
From: James Hensman
Date: Mon, 4 Feb 2013 12:18:14 +0000
Subject: [PATCH 43/44] proper propagation of variance through the Gaussian
 likelihood

---
 GPy/likelihoods/Gaussian.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/GPy/likelihoods/Gaussian.py b/GPy/likelihoods/Gaussian.py
index 630b5d91..a34b3e6c 100644
--- a/GPy/likelihoods/Gaussian.py
+++ b/GPy/likelihoods/Gaussian.py
@@ -42,8 +42,8 @@ class Gaussian(likelihood):
         """
         mean = mu*self._std + self._mean
         true_var = (var + self._variance)*self._std**2
-        _5pc = mean + - 2.*np.sqrt(var)
-        _95pc = mean + 2.*np.sqrt(var)
+        _5pc = mean - 2.*np.sqrt(true_var)
+        _95pc = mean + 2.*np.sqrt(true_var)
         return mean, _5pc, _95pc

     def fit_full(self):
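The corrected lines in PATCH 43 unnormalise the latent mean, inflate the latent variance with the observation noise, and only then take the +/- 2 sigma band; the old code built the band from the raw latent variance. A standalone sketch with made-up numbers (the underscore names mirror the Gaussian likelihood's attributes):

    import numpy as np

    mu  = np.array([[0.1], [0.5]])            # latent mean, normalised space
    var = np.array([[0.04], [0.09]])          # latent variance
    _mean, _std, _variance = 2.0, 3.0, 0.25   # data offset, data scale, noise variance

    mean = mu*_std + _mean
    true_var = (var + _variance)*_std**2      # noise added, then rescaled
    _5pc  = mean - 2.*np.sqrt(true_var)
    _95pc = mean + 2.*np.sqrt(true_var)
    print(mean.ravel(), _5pc.ravel(), _95pc.ravel())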
""" x = self._get_params_transformed().copy() - #choose a random direction to step in: - dx = step*np.sign(np.random.uniform(-1,1,x.size)) + if not verbose: + #just check the global ratio + dx = step*np.sign(np.random.uniform(-1,1,x.size)) - #evaulate around the point x - self._set_params_transformed(x+dx) - f1,g1 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed() - self._set_params_transformed(x-dx) - f2,g2 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed() - self._set_params_transformed(x) - gradient = self._log_likelihood_gradients_transformed() + #evaulate around the point x + self._set_params_transformed(x+dx) + f1,g1 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed() + self._set_params_transformed(x-dx) + f2,g2 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed() + self._set_params_transformed(x) + gradient = self._log_likelihood_gradients_transformed() - numerical_gradient = (f1-f2)/(2*dx) - global_ratio = (f1-f2)/(2*np.dot(dx,gradient)) - if verbose: - print "Gradient ratio = ", global_ratio, '\n' - sys.stdout.flush() + numerical_gradient = (f1-f2)/(2*dx) + global_ratio = (f1-f2)/(2*np.dot(dx,gradient)) - if (np.abs(1.-global_ratio)