diff --git a/GPy/__init__.py b/GPy/__init__.py index 381d6232..c0772c27 100644 --- a/GPy/__init__.py +++ b/GPy/__init__.py @@ -7,5 +7,5 @@ import models import inference import util import examples -#import examples TODO: discuss! from core import priors +import likelihoods diff --git a/GPy/core/model.py b/GPy/core/model.py index 145a607f..5e228b15 100644 --- a/GPy/core/model.py +++ b/GPy/core/model.py @@ -10,6 +10,7 @@ from parameterised import parameterised, truncate_pad import priors from ..util.linalg import jitchol from ..inference import optimization +from .. import likelihoods class model(parameterised): def __init__(self): @@ -303,54 +304,62 @@ class model(parameterised): return '\n'.join(s) - def checkgrad(self, verbose=False, include_priors=False, step=1e-6, tolerance = 1e-3, return_ratio=False, *args): + def checkgrad(self, verbose=False, include_priors=False, step=1e-6, tolerance = 1e-3): """ Check the gradient of the model by comparing to a numerical estimate. - If the overall gradient fails, invividual components are tested. + If the verbose flag is passed, invividual components are tested (and printed) + + :param verbose: If True, print a "full" checking of each parameter + :type verbose: bool + :param step: The size of the step around which to linearise the objective + :type step: float (defaul 1e-6) + :param tolerance: the tolerance allowed (see note) + :type tolerance: float (default 1e-3) + + Note:- + The gradient is considered correct if the ratio of the analytical + and numerical gradients is within of unity. """ x = self._get_params_transformed().copy() - #choose a random direction to step in: - dx = step*np.sign(np.random.uniform(-1,1,x.size)) + if not verbose: + #just check the global ratio + dx = step*np.sign(np.random.uniform(-1,1,x.size)) - #evaulate around the point x - self._set_params_transformed(x+dx) - f1,g1 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed() - self._set_params_transformed(x-dx) - f2,g2 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed() - self._set_params_transformed(x) - gradient = self._log_likelihood_gradients_transformed() + #evaulate around the point x + self._set_params_transformed(x+dx) + f1,g1 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed() + self._set_params_transformed(x-dx) + f2,g2 = self.log_likelihood() + self.log_prior(), self._log_likelihood_gradients_transformed() + self._set_params_transformed(x) + gradient = self._log_likelihood_gradients_transformed() - numerical_gradient = (f1-f2)/(2*dx) - global_ratio = (f1-f2)/(2*np.dot(dx,gradient)) - if verbose: - print "Gradient ratio = ", global_ratio, '\n' - sys.stdout.flush() + numerical_gradient = (f1-f2)/(2*dx) + global_ratio = (f1-f2)/(2*np.dot(dx,gradient)) - if (np.abs(1.-global_ratio) epsilon or not iteration: + print 'EM iteration %s' %iteration + self.update_likelihood_approximation() + self.optimize(**kwargs) + new_value = self.log_likelihood() + log_change = new_value - last_value + if log_change > epsilon: + self.log_likelihood_record.append(new_value) + self.gp_params_record.append(self._get_params()) + #self.ep_params_record.append((self.beta,self.Y,self.Z_ep)) + last_value = new_value else: - return False - - if return_ratio: - return global_ratio - else: - return True + convergence = False + #self.beta, self.Y, self.Z_ep = self.ep_params_record[-1] + self._set_params(self.gp_params_record[-1]) + print "Log-likelihood decrement: %s \nLast iteration 
discarded." %log_change + iteration += 1 diff --git a/GPy/examples/classification.py b/GPy/examples/classification.py index 989ed08a..c25ea124 100644 --- a/GPy/examples/classification.py +++ b/GPy/examples/classification.py @@ -3,16 +3,15 @@ """ -Simple Gaussian Processes classification +Gaussian Processes classification """ import pylab as pb import numpy as np import GPy default_seed=10000 -###################################### -## 2 dimensional example -def crescent_data(model_type='Full', inducing=10, seed=default_seed): + +def crescent_data(model_type='Full', inducing=10, seed=default_seed): #FIXME """Run a Gaussian process classification on the crescent data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood. :param model_type: type of model to fit ['Full', 'FITC', 'DTC']. @@ -30,7 +29,7 @@ def crescent_data(model_type='Full', inducing=10, seed=default_seed): # create sparse GP EP model m = GPy.models.sparse_GP_EP(data['X'],likelihood=likelihood,inducing=inducing,ep_proxy=model_type) - m.approximate_likelihood() + m.update_likelihood_approximation() print(m) # optimize @@ -42,54 +41,66 @@ def crescent_data(model_type='Full', inducing=10, seed=default_seed): return m def oil(): - """Run a Gaussian process classification on the oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood.""" + """ + Run a Gaussian process classification on the oil data. The demonstration calls the basic GP classification model and uses EP to approximate the likelihood. + """ data = GPy.util.datasets.oil() - likelihood = GPy.inference.likelihoods.probit(data['Y'][:, 0:1]) + # Kernel object + kernel = GPy.kern.rbf(12) - # create simple GP model - m = GPy.models.GP_EP(data['X'],likelihood) + # Likelihood object + distribution = GPy.likelihoods.likelihood_functions.probit() + likelihood = GPy.likelihoods.EP(data['Y'][:, 0:1],distribution) - # contrain all parameters to be positive + # Create GP model + m = GPy.models.GP(data['X'],kernel,likelihood=likelihood) + + # Contrain all parameters to be positive m.constrain_positive('') m.tie_param('lengthscale') - m.approximate_likelihood() + m.update_likelihood_approximation() - # optimize + # Optimize m.optimize() - # plot - #m.plot() print(m) return m -def toy_linear_1d_classification(model_type='Full', inducing=4, seed=default_seed): - """Simple 1D classification example. - :param model_type: type of model to fit ['Full', 'FITC', 'DTC']. +def toy_linear_1d_classification(seed=default_seed): + """ + Simple 1D classification example :param seed : seed value for data generation (default is 4). :type seed: int - :param inducing : number of inducing variables (only used for 'FITC' or 'DTC'). 
:type inducing: int """ + data = GPy.util.datasets.toy_linear_1d_classification(seed=seed) - likelihood = GPy.inference.likelihoods.probit(data['Y'][:, 0:1]) - assert model_type in ('Full','DTC','FITC') - # create simple GP model - if model_type=='Full': - m = GPy.models.simple_GP_EP(data['X'],likelihood) - else: - # create sparse GP EP model - m = GPy.models.sparse_GP_EP(data['X'],likelihood=likelihood,inducing=inducing,ep_proxy=model_type) - + # Kernel object + kernel = GPy.kern.rbf(1) - m.constrain_positive('var') - m.constrain_positive('len') - m.tie_param('lengthscale') - m.approximate_likelihood() + # Likelihood object + distribution = GPy.likelihoods.likelihood_functions.probit() + likelihood = GPy.likelihoods.EP(data['Y'][:, 0:1],distribution) - # Optimize and plot - m.em(plot_all=False) # EM algorithm + # Model definition + m = GPy.models.GP(data['X'],likelihood=likelihood,kernel=kernel) + + # Optimize + """ + EPEM runs a loop that consists of two steps: + 1) EP likelihood approximation: + m.update_likelihood_approximation() + 2) Parameters optimization: + m.optimize() + """ + m.EPEM() + + # Plot + pb.subplot(211) + m.plot_internal() + pb.subplot(212) m.plot() - print(m) + return m diff --git a/GPy/examples/poisson.py b/GPy/examples/poisson.py new file mode 100644 index 00000000..934637f1 --- /dev/null +++ b/GPy/examples/poisson.py @@ -0,0 +1,48 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +""" +Simple Gaussian Processes classification +""" +import pylab as pb +import numpy as np +import GPy +pb.ion() + +pb.close('all') +default_seed=10000 + +model_type='Full' +inducing=4 +seed=default_seed +"""Simple 1D classification example. +:param model_type: type of model to fit ['Full', 'FITC', 'DTC']. +:param seed : seed value for data generation (default is 4). +:type seed: int +:param inducing : number of inducing variables (only used for 'FITC' or 'DTC'). +:type inducing: int +""" + +X = np.arange(0,100,5)[:,None] +F = np.round(np.sin(X/18.) + .1*X) + np.arange(5,25)[:,None] +E = np.random.randint(-5,5,20)[:,None] +Y = F + E +pb.figure() +likelihood = GPy.inference.likelihoods.poisson(Y,scale=1.) + +m = GPy.models.GP(X,likelihood=likelihood) +#m = GPy.models.GP(X,Y=likelihood.Y) + +m.constrain_positive('var') +m.constrain_positive('len') +m.tie_param('lengthscale') +if not isinstance(m.likelihood,GPy.inference.likelihoods.gaussian): + m.approximate_likelihood() +print m.checkgrad() +# Optimize and plot +m.optimize() +#m.em(plot_all=False) # EM algorithm +m.plot(samples=4) + +print(m) diff --git a/GPy/examples/sparse_ep_fix.py b/GPy/examples/sparse_ep_fix.py new file mode 100644 index 00000000..defcb4eb --- /dev/null +++ b/GPy/examples/sparse_ep_fix.py @@ -0,0 +1,60 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). 
+# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import numpy as np +""" +Sparse Gaussian Processes regression with an RBF kernel +""" +import pylab as pb +import numpy as np +import GPy +np.random.seed(2) +pb.ion() +N = 500 +M = 5 + +pb.close('all') +###################################### +## 1 dimensional example + +# sample inputs and outputs +X = np.random.uniform(-3.,3.,(N,1)) +#Y = np.sin(X)+np.random.randn(N,1)*0.05 +F = np.sin(X)+np.random.randn(N,1)*0.05 +Y = np.ones([F.shape[0],1]) +Y[F<0] = -1 +likelihood = GPy.inference.likelihoods.probit(Y) + +# construct kernel +rbf = GPy.kern.rbf(1) +noise = GPy.kern.white(1) +kernel = rbf + noise + +# create simple GP model +#m = GPy.models.sparse_GP(X,Y=None, kernel=kernel, M=M,likelihood= likelihood) + +# contrain all parameters to be positive +#m.constrain_fixed('prec',100.) +m = GPy.models.sparse_GP(X, Y, kernel, M=M) +m.ensure_default_constraints() +#if not isinstance(m.likelihood,GPy.inference.likelihoods.gaussian): +# m.approximate_likelihood() +print m.checkgrad() +m.optimize('tnc', messages = 1) +m.plot(samples=3) +print m + +n = GPy.models.sparse_GP(X,Y=None, kernel=kernel, M=M,likelihood= likelihood) +n.ensure_default_constraints() +if not isinstance(n.likelihood,GPy.inference.likelihoods.gaussian): + n.approximate_likelihood() +print n.checkgrad() +pb.figure() +n.plot() + +""" +m = GPy.models.sparse_GP_regression(X, Y, kernel, M=M) +m.ensure_default_constraints() +print m.checkgrad() +""" diff --git a/GPy/inference/Expectation_Propagation.py b/GPy/inference/Expectation_Propagation.py deleted file mode 100644 index 05453f1d..00000000 --- a/GPy/inference/Expectation_Propagation.py +++ /dev/null @@ -1,240 +0,0 @@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). -# Licensed under the BSD 3-clause license (see LICENSE.txt) - - -import numpy as np -import random -from scipy import stats, linalg -from .likelihoods import likelihood -from ..core import model -from ..util.linalg import pdinv,mdot,jitchol -from ..util.plot import gpplot -from .. import kern - -class EP_base: - """ - Expectation Propagation. - - This is just the base class for expectation propagation. We'll extend it for full and sparse EP. - """ - def __init__(self,likelihood,epsilon=1e-3,powerep=[1.,1.]): - self.likelihood = likelihood - self.epsilon = epsilon - self.eta, self.delta = powerep - self.jitter = 1e-12 - - #Initial values - Likelihood approximation parameters: - #p(y|f) = t(f|tau_tilde,v_tilde) - self.restart_EP() - - def restart_EP(self): - """ - Set the EP approximation to initial state - """ - self.tau_tilde = np.zeros(self.N) - self.v_tilde = np.zeros(self.N) - self.mu = np.zeros(self.N) - -class Full(EP_base): - """ - :param likelihood: Output's likelihood (e.g. probit) - :type likelihood: GPy.inference.likelihood instance - :param K: prior covariance matrix - :type K: np.ndarray (N x N) - :param likelihood: Output's likelihood (e.g. probit) - :type likelihood: GPy.inference.likelihood instance - :param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float) - :param powerep: Power-EP parameters (eta,delta) - 2x1 numpy array (floats) - """ - def __init__(self,K,likelihood,*args,**kwargs): - assert K.shape[0] == K.shape[1] - self.K = K - self.N = self.K.shape[0] - EP_base.__init__(self,likelihood,*args,**kwargs) - def fit_EP(self,messages=False): - """ - The expectation-propagation algorithm. - For nomenclature see Rasmussen & Williams 2006 (pag. 
52-60) - """ - #Prior distribution parameters: p(f|X) = N(f|0,K) - #self.K = self.kernel.K(self.X,self.X) - - #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma) - self.mu=np.zeros(self.N) - self.Sigma=self.K.copy() - - """ - Initial values - Cavity distribution parameters: - q_(f|mu_,sigma2_) = Product{q_i(f|mu_i,sigma2_i)} - sigma_ = 1./tau_ - mu_ = v_/tau_ - """ - - self.tau_ = np.empty(self.N,dtype=np.float64) - self.v_ = np.empty(self.N,dtype=np.float64) - - #Initial values - Marginal moments - z = np.empty(self.N,dtype=np.float64) - self.Z_hat = np.empty(self.N,dtype=np.float64) - phi = np.empty(self.N,dtype=np.float64) - mu_hat = np.empty(self.N,dtype=np.float64) - sigma2_hat = np.empty(self.N,dtype=np.float64) - - #Approximation - epsilon_np1 = self.epsilon + 1. - epsilon_np2 = self.epsilon + 1. - self.iterations = 0 - self.np1 = [self.tau_tilde.copy()] - self.np2 = [self.v_tilde.copy()] - while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: - update_order = np.random.permutation(self.N) - for i in update_order: - #Cavity distribution parameters - self.tau_[i] = 1./self.Sigma[i,i] - self.eta*self.tau_tilde[i] - self.v_[i] = self.mu[i]/self.Sigma[i,i] - self.eta*self.v_tilde[i] - #Marginal moments - self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood.moments_match(i,self.tau_[i],self.v_[i]) - #Site parameters update - Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./self.Sigma[i,i]) - Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - self.mu[i]/self.Sigma[i,i]) - self.tau_tilde[i] = self.tau_tilde[i] + Delta_tau - self.v_tilde[i] = self.v_tilde[i] + Delta_v - #Posterior distribution parameters update - si=self.Sigma[:,i].reshape(self.N,1) - self.Sigma = self.Sigma - Delta_tau/(1.+ Delta_tau*self.Sigma[i,i])*np.dot(si,si.T) - self.mu = np.dot(self.Sigma,self.v_tilde) - self.iterations += 1 - #Sigma recomptutation with Cholesky decompositon - Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*(self.K) - B = np.eye(self.N) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K - L = jitchol(B) - V,info = linalg.flapack.dtrtrs(L,Sroot_tilde_K,lower=1) - self.Sigma = self.K - np.dot(V.T,V) - self.mu = np.dot(self.Sigma,self.v_tilde) - epsilon_np1 = np.mean(self.tau_tilde-self.np1[-1]**2) - epsilon_np2 = np.mean(self.v_tilde-self.np2[-1]**2) - self.np1.append(self.tau_tilde.copy()) - self.np2.append(self.v_tilde.copy()) - if messages: - print "EP iteration %i, epsiolon %d"%(self.iterations,epsilon_np1) - -class FITC(EP_base): - """ - :param likelihood: Output's likelihood (e.g. probit) - :type likelihood: GPy.inference.likelihood instance - :param Knn_diag: The diagonal elements of Knn is a 1D vector - :param Kmn: The 'cross' variance between inducing inputs and data - :param Kmm: the covariance matrix of the inducing inputs - :param likelihood: Output's likelihood (e.g. 
probit) - :type likelihood: GPy.inference.likelihood instance - :param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float) - :param powerep: Power-EP parameters (eta,delta) - 2x1 numpy array (floats) - """ - def __init__(self,likelihood,Knn_diag,Kmn,Kmm,*args,**kwargs): - self.Knn_diag = Knn_diag - self.Kmn = Kmn - self.Kmm = Kmm - self.M = self.Kmn.shape[0] - self.N = self.Kmn.shape[1] - assert self.M <= self.N, 'The number of inducing inputs must be smaller than the number of observations' - assert len(Knn_diag) == self.N, 'Knn_diagonal has size different from N' - EP_base.__init__(self,likelihood,*args,**kwargs) - - def fit_EP(self): - """ - The expectation-propagation algorithm with sparse pseudo-input. - For nomenclature see Naish-Guzman and Holden, 2008. - """ - - """ - Prior approximation parameters: - q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0) - Sigma0 = diag(Knn-Qnn) + Qnn, Qnn = Knm*Kmmi*Kmn - """ - self.Kmmi, self.Kmm_hld = pdinv(self.Kmm) - self.P0 = self.Kmn.T - self.KmnKnm = np.dot(self.P0.T, self.P0) - self.KmmiKmn = np.dot(self.Kmmi,self.P0.T) - self.Qnn_diag = np.sum(self.P0.T*self.KmmiKmn,-2) - self.Diag0 = self.Knn_diag - self.Qnn_diag - self.R0 = jitchol(self.Kmmi).T - - """ - Posterior approximation: q(f|y) = N(f| mu, Sigma) - Sigma = Diag + P*R.T*R*P.T + K - mu = w + P*gamma - """ - self.w = np.zeros(self.N) - self.gamma = np.zeros(self.M) - self.mu = np.zeros(self.N) - self.P = self.P0.copy() - self.R = self.R0.copy() - self.Diag = self.Diag0.copy() - self.Sigma_diag = self.Knn_diag - - """ - Initial values - Cavity distribution parameters: - q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)} - sigma_ = 1./tau_ - mu_ = v_/tau_ - """ - self.tau_ = np.empty(self.N,dtype=np.float64) - self.v_ = np.empty(self.N,dtype=np.float64) - - #Initial values - Marginal moments - z = np.empty(self.N,dtype=np.float64) - self.Z_hat = np.empty(self.N,dtype=np.float64) - phi = np.empty(self.N,dtype=np.float64) - mu_hat = np.empty(self.N,dtype=np.float64) - sigma2_hat = np.empty(self.N,dtype=np.float64) - - #Approximation - epsilon_np1 = 1 - epsilon_np2 = 1 - self.iterations = 0 - self.np1 = [self.tau_tilde.copy()] - self.np2 = [self.v_tilde.copy()] - while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: - update_order = np.arange(self.N) - random.shuffle(update_order) - for i in update_order: - #Cavity distribution parameters - self.tau_[i] = 1./self.Sigma_diag[i] - self.eta*self.tau_tilde[i] - self.v_[i] = self.mu[i]/self.Sigma_diag[i] - self.eta*self.v_tilde[i] - #Marginal moments - self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood.moments_match(i,self.tau_[i],self.v_[i]) - #Site parameters update - Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./self.Sigma_diag[i]) - Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - self.mu[i]/self.Sigma_diag[i]) - self.tau_tilde[i] = self.tau_tilde[i] + Delta_tau - self.v_tilde[i] = self.v_tilde[i] + Delta_v - #Posterior distribution parameters update - dtd1 = Delta_tau*self.Diag[i] + 1. 
- dii = self.Diag[i] - self.Diag[i] = dii - (Delta_tau * dii**2.)/dtd1 - pi_ = self.P[i,:].reshape(1,self.M) - self.P[i,:] = pi_ - (Delta_tau*dii)/dtd1 * pi_ - Rp_i = np.dot(self.R,pi_.T) - RTR = np.dot(self.R.T,np.dot(np.eye(self.M) - Delta_tau/(1.+Delta_tau*self.Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),self.R)) - self.R = jitchol(RTR).T - self.w[i] = self.w[i] + (Delta_v - Delta_tau*self.w[i])*dii/dtd1 - self.gamma = self.gamma + (Delta_v - Delta_tau*self.mu[i])*np.dot(RTR,self.P[i,:].T) - self.RPT = np.dot(self.R,self.P.T) - self.Sigma_diag = self.Diag + np.sum(self.RPT.T*self.RPT.T,-1) - self.mu = self.w + np.dot(self.P,self.gamma) - self.iterations += 1 - #Sigma recomptutation with Cholesky decompositon - self.Diag = self.Diag0/(1.+ self.Diag0 * self.tau_tilde) - self.P = (self.Diag / self.Diag0)[:,None] * self.P0 - self.RPT0 = np.dot(self.R0,self.P0.T) - L = jitchol(np.eye(self.M) + np.dot(self.RPT0,(1./self.Diag0 - self.Diag/(self.Diag0**2))[:,None]*self.RPT0.T)) - self.R,info = linalg.flapack.dtrtrs(L,self.R0,lower=1) - self.RPT = np.dot(self.R,self.P.T) - self.Sigma_diag = self.Diag + np.sum(self.RPT.T*self.RPT.T,-1) - self.w = self.Diag * self.v_tilde - self.gamma = np.dot(self.R.T, np.dot(self.RPT,self.v_tilde)) - self.mu = self.w + np.dot(self.P,self.gamma) - epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N - epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N - self.np1.append(self.tau_tilde.copy()) - self.np2.append(self.v_tilde.copy()) diff --git a/GPy/inference/likelihoods.py b/GPy/inference/likelihoods.py deleted file mode 100644 index c9b36e10..00000000 --- a/GPy/inference/likelihoods.py +++ /dev/null @@ -1,219 +0,0 @@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). -# Licensed under the BSD 3-clause license (see LICENSE.txt) - - -import numpy as np -from scipy import stats -import scipy as sp -import pylab as pb -from ..util.plot import gpplot - -class likelihood: - def __init__(self,Y,location=0,scale=1): - """ - Likelihood class for doing Expectation propagation - - :param Y: observed output (Nx1 numpy.darray) - ..Note:: Y values allowed depend on the likelihood used - """ - self.Y = Y - self.N = self.Y.shape[0] - self.location = location - self.scale = scale - - def plot1Da(self,X_new,Mean_new,Var_new,X_u,Mean_u,Var_u): - """ - Plot the predictive distribution of the GP model for 1-dimensional inputs - - :param X_new: The points at which to make a prediction - :param Mean_new: mean values at X_new - :param Var_new: variance values at X_new - :param X_u: input (inducing) points used to train the model - :param Mean_u: mean values at X_u - :param Var_new: variance values at X_u - """ - assert X_new.shape[1] == 1, 'Number of dimensions must be 1' - gpplot(X_new,Mean_new,Var_new) - pb.errorbar(X_u,Mean_u,2*np.sqrt(Var_u),fmt='r+') - pb.plot(X_u,Mean_u,'ro') - - def plot2D(self,X,X_new,F_new,U=None): - """ - Predictive distribution of the fitted GP model for 2-dimensional inputs - - :param X_new: The points at which to make a prediction - :param Mean_new: mean values at X_new - :param Var_new: variance values at X_new - :param X_u: input points used to train the model - :param Mean_u: mean values at X_u - :param Var_new: variance values at X_u - """ - N,D = X_new.shape - assert D == 2, 'Number of dimensions must be 2' - n = np.sqrt(N) - x1min = X_new[:,0].min() - x1max = X_new[:,0].max() - x2min = X_new[:,1].min() - x2max = X_new[:,1].max() - pb.imshow(F_new.reshape(n,n),extent=(x1min,x1max,x2max,x2min),vmin=0,vmax=1) - pb.colorbar() - C1 = 
np.arange(self.N)[self.Y.flatten()==1] - C2 = np.arange(self.N)[self.Y.flatten()==-1] - [pb.plot(X[i,0],X[i,1],'ro') for i in C1] - [pb.plot(X[i,0],X[i,1],'bo') for i in C2] - pb.xlim(x1min,x1max) - pb.ylim(x2min,x2max) - if U is not None: - [pb.plot(a,b,'wo') for a,b in U] - -class probit(likelihood): - """ - Probit likelihood - Y is expected to take values in {-1,1} - ----- - $$ - L(x) = \\Phi (Y_i*f_i) - $$ - """ - def moments_match(self,i,tau_i,v_i): - """ - Moments match of the marginal approximation in EP algorithm - - :param i: number of observation (int) - :param tau_i: precision of the cavity distribution (float) - :param v_i: mean/variance of the cavity distribution (float) - """ - z = self.Y[i]*v_i/np.sqrt(tau_i**2 + tau_i) - Z_hat = stats.norm.cdf(z) - phi = stats.norm.pdf(z) - mu_hat = v_i/tau_i + self.Y[i]*phi/(Z_hat*np.sqrt(tau_i**2 + tau_i)) - sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat) - return Z_hat, mu_hat, sigma2_hat - - def plot1Db(self,X,X_new,F_new,U=None): - assert X.shape[1] == 1, 'Number of dimensions must be 1' - gpplot(X_new,F_new,np.zeros(X_new.shape[0])) - pb.plot(X,(self.Y+1)/2,'kx',mew=1.5) - pb.ylim(-0.2,1.2) - if U is not None: - pb.plot(U,U*0+.5,'r|',mew=1.5,markersize=12) - - def predictive_mean(self,mu,variance): - return stats.norm.cdf(mu/np.sqrt(1+variance)) - - def _log_likelihood_gradients(): - raise NotImplementedError - -class poisson(likelihood): - """ - Poisson likelihood - Y is expected to take values in {0,1,2,...} - ----- - $$ - L(x) = \exp(\lambda) * \lambda**Y_i / Y_i! - $$ - """ - def moments_match(self,i,tau_i,v_i): - """ - Moments match of the marginal approximation in EP algorithm - - :param i: number of observation (int) - :param tau_i: precision of the cavity distribution (float) - :param v_i: mean/variance of the cavity distribution (float) - """ - mu = v_i/tau_i - sigma = np.sqrt(1./tau_i) - def poisson_norm(f): - """ - Product of the likelihood and the cavity distribution - """ - pdf_norm_f = stats.norm.pdf(f,loc=mu,scale=sigma) - rate = np.exp( (f*self.scale)+self.location) - poisson = stats.poisson.pmf(float(self.Y[i]),rate) - return pdf_norm_f*poisson - - def log_pnm(f): - """ - Log of poisson_norm - """ - return -(-.5*(f-mu)**2/sigma**2 - np.exp( (f*self.scale)+self.location) + ( (f*self.scale)+self.location)*self.Y[i]) - - """ - Golden Search and Simpson's Rule - -------------------------------- - Simpson's Rule is used to calculate the moments mumerically, it needs a grid of points as input. - Golden Search is used to find the mode in the poisson_norm distribution and define around it the grid for Simpson's Rule - """ - #TODO golden search & simpson's rule can be defined in the general likelihood class, rather than in each specific case. 
- - #Golden search - golden_A = -1 if self.Y[i] == 0 else np.array([np.log(self.Y[i]),mu]).min() #Lower limit - golden_B = np.array([np.log(self.Y[i]),mu]).max() #Upper limit - golden_A = (golden_A - self.location)/self.scale - golden_B = (golden_B - self.location)/self.scale - opt = sp.optimize.golden(log_pnm,brack=(golden_A,golden_B)) #Better to work with log_pnm than with poisson_norm - - # Simpson's approximation - width = 3./np.log(max(self.Y[i],2)) - A = opt - width #Lower limit - B = opt + width #Upper limit - K = 10*int(np.log(max(self.Y[i],150))) #Number of points in the grid, we DON'T want K to be the same number for every case - h = (B-A)/K # length of the intervals - grid_x = np.hstack([np.linspace(opt-width,opt,K/2+1)[1:-1], np.linspace(opt,opt+width,K/2+1)]) # grid of points (X axis) - x = np.hstack([A,B,grid_x[range(1,K,2)],grid_x[range(2,K-1,2)]]) # grid_x rearranged, just to make Simpson's algorithm easier - zeroth = np.hstack([poisson_norm(A),poisson_norm(B),[4*poisson_norm(f) for f in grid_x[range(1,K,2)]],[2*poisson_norm(f) for f in grid_x[range(2,K-1,2)]]]) # grid of points (Y axis) rearranged like x - first = zeroth*x - second = first*x - Z_hat = sum(zeroth)*h/3 # Zero-th moment - mu_hat = sum(first)*h/(3*Z_hat) # First moment - m2 = sum(second)*h/(3*Z_hat) # Second moment - sigma2_hat = m2 - mu_hat**2 # Second central moment - return float(Z_hat), float(mu_hat), float(sigma2_hat) - - def plot1Db(self,X,X_new,F_new,F2_new=None,U=None): - pb.subplot(212) - #gpplot(X_new,F_new,np.sqrt(F2_new)) - pb.plot(X_new,F_new)#,np.sqrt(F2_new)) #FIXME - pb.plot(X,self.Y,'kx',mew=1.5) - if U is not None: - pb.plot(U,np.ones(U.shape[0])*self.Y.min()*.8,'r|',mew=1.5,markersize=12) - def predictive_mean(self,mu,variance): - return np.exp(mu*self.scale + self.location) - def predictive_variance(self,mu,variance): - return mu - def _log_likelihood_gradients(): - raise NotImplementedError - -class gaussian(likelihood): - """ - Gaussian likelihood - Y is expected to take values in (-inf,inf) - """ - def moments_match(self,i,tau_i,v_i): - """ - Moments match of the marginal approximation in EP algorithm - - :param i: number of observation (int) - :param tau_i: precision of the cavity distribution (float) - :param v_i: mean/variance of the cavity distribution (float) - """ - mu = v_i/tau_i - sigma = np.sqrt(1./tau_i) - s = 1. 
if self.Y[i] == 0 else 1./self.Y[i] - sigma2_hat = 1./(1./sigma**2 + 1./s**2) - mu_hat = sigma2_hat*(mu/sigma**2 + self.Y[i]/s**2) - Z_hat = 1./np.sqrt(2*np.pi) * 1./np.sqrt(sigma**2+s**2) * np.exp(-.5*(mu-self.Y[i])**2/(sigma**2 + s**2)) - return Z_hat, mu_hat, sigma2_hat - - def plot1Db(self,X,X_new,F_new,U=None): - assert X.shape[1] == 1, 'Number of dimensions must be 1' - gpplot(X_new,F_new,np.zeros(X_new.shape[0])) - pb.plot(X,self.Y,'kx',mew=1.5) - if U is not None: - pb.plot(U,np.ones(U.shape[0])*self.Y.min()*.8,'r|',mew=1.5,markersize=12) - - def predictive_mean(self,mu,Sigma): - return mu - - def _log_likelihood_gradients(): - raise NotImplementedError diff --git a/GPy/likelihoods/EP.py b/GPy/likelihoods/EP.py new file mode 100644 index 00000000..efd887ae --- /dev/null +++ b/GPy/likelihoods/EP.py @@ -0,0 +1,311 @@ +import numpy as np +from scipy import stats, linalg +from ..util.linalg import pdinv,mdot,jitchol +from likelihood import likelihood + +class EP(likelihood): + def __init__(self,data,likelihood_function,epsilon=1e-3,power_ep=[1.,1.]): + """ + Expectation Propagation + + Arguments + --------- + epsilon : Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float) + likelihood_function : a likelihood function (see likelihood_functions.py) + """ + self.likelihood_function = likelihood_function + self.epsilon = epsilon + self.eta, self.delta = power_ep + self.data = data + self.N = self.data.size + self.is_heteroscedastic = True + self.Nparams = 0 + + #Initial values - Likelihood approximation parameters: + #p(y|f) = t(f|tau_tilde,v_tilde) + self.tau_tilde = np.zeros(self.N) + self.v_tilde = np.zeros(self.N) + + #initial values for the GP variables + self.Y = np.zeros((self.N,1)) + self.covariance_matrix = np.eye(self.N) + self.precision = np.ones(self.N) + self.Z = 0 + self.YYT = None + + def predictive_values(self,mu,var): + return self.likelihood_function.predictive_values(mu,var) + + def _get_params(self): + return np.zeros(0) + def _get_param_names(self): + return [] + def _set_params(self,p): + pass # TODO: the EP likelihood might want to take some parameters... + def _gradients(self,partial): + return np.zeros(0) # TODO: the EP likelihood might want to take some parameters... + + def _compute_GP_variables(self): + #Variables to be called from GP + mu_tilde = self.v_tilde/self.tau_tilde #When calling EP, this variable is used instead of Y in the GP model + sigma_sum = 1./self.tau_ + 1./self.tau_tilde + mu_diff_2 = (self.v_/self.tau_ - mu_tilde)**2 + self.Z = np.sum(np.log(self.Z_hat)) + 0.5*np.sum(np.log(sigma_sum)) + 0.5*np.sum(mu_diff_2/sigma_sum) #Normalization constant, aka Z_ep + + self.Y = mu_tilde[:,None] + self.YYT = np.dot(self.Y,self.Y.T) + self.precision = self.tau_tilde + self.covariance_matrix = np.diag(1./self.precision) + + def fit_full(self,K): + """ + The expectation-propagation algorithm. + For nomenclature see Rasmussen & Williams 2006. 
+ """ + #Prior distribution parameters: p(f|X) = N(f|0,K) + + self.tau_tilde = np.zeros(self.N) + self.v_tilde = np.zeros(self.N) + #Initial values - Posterior distribution parameters: q(f|X,Y) = N(f|mu,Sigma) + mu = np.zeros(self.N) + Sigma = K.copy() + + """ + Initial values - Cavity distribution parameters: + q_(f|mu_,sigma2_) = Product{q_i(f|mu_i,sigma2_i)} + sigma_ = 1./tau_ + mu_ = v_/tau_ + """ + self.tau_ = np.empty(self.N,dtype=float) + self.v_ = np.empty(self.N,dtype=float) + + #Initial values - Marginal moments + z = np.empty(self.N,dtype=float) + self.Z_hat = np.empty(self.N,dtype=float) + phi = np.empty(self.N,dtype=float) + mu_hat = np.empty(self.N,dtype=float) + sigma2_hat = np.empty(self.N,dtype=float) + + #Approximation + epsilon_np1 = self.epsilon + 1. + epsilon_np2 = self.epsilon + 1. + self.iterations = 0 + self.np1 = [self.tau_tilde.copy()] + self.np2 = [self.v_tilde.copy()] + while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: + update_order = np.random.permutation(self.N) + for i in update_order: + #Cavity distribution parameters + self.tau_[i] = 1./Sigma[i,i] - self.eta*self.tau_tilde[i] + self.v_[i] = mu[i]/Sigma[i,i] - self.eta*self.v_tilde[i] + #Marginal moments + self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood_function.moments_match(self.data[i],self.tau_[i],self.v_[i]) + #Site parameters update + Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma[i,i]) + Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma[i,i]) + self.tau_tilde[i] = self.tau_tilde[i] + Delta_tau + self.v_tilde[i] = self.v_tilde[i] + Delta_v + #Posterior distribution parameters update + si=Sigma[:,i].reshape(self.N,1) + Sigma = Sigma - Delta_tau/(1.+ Delta_tau*Sigma[i,i])*np.dot(si,si.T) + mu = np.dot(Sigma,self.v_tilde) + self.iterations += 1 + #Sigma recomptutation with Cholesky decompositon + Sroot_tilde_K = np.sqrt(self.tau_tilde)[:,None]*K + B = np.eye(self.N) + np.sqrt(self.tau_tilde)[None,:]*Sroot_tilde_K + L = jitchol(B) + V,info = linalg.flapack.dtrtrs(L,Sroot_tilde_K,lower=1) + Sigma = K - np.dot(V.T,V) + mu = np.dot(Sigma,self.v_tilde) + epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N + epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N + self.np1.append(self.tau_tilde.copy()) + self.np2.append(self.v_tilde.copy()) + + return self._compute_GP_variables() + + def fit_DTC(self, Knn_diag, Kmn, Kmm): + """ + The expectation-propagation algorithm with sparse pseudo-input. + For nomenclature see ... 2013. + """ + + #TODO: this doesn;t work with uncertain inputs! 
+ + """ + Prior approximation parameters: + q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0) + Sigma0 = Qnn = Knm*Kmmi*Kmn + """ + Kmmi, Lm, Lmi, Kmm_logdet = pdinv(Kmm) + KmnKnm = np.dot(Kmn, Kmn.T) + KmmiKmn = np.dot(Kmmi,Kmn) + Qnn_diag = np.sum(Kmn*KmmiKmn,-2) + LLT0 = Kmm.copy() + + """ + Posterior approximation: q(f|y) = N(f| mu, Sigma) + Sigma = Diag + P*R.T*R*P.T + K + mu = w + P*gamma + """ + mu = np.zeros(self.N) + LLT = Kmm.copy() + Sigma_diag = Qnn_diag.copy() + + """ + Initial values - Cavity distribution parameters: + q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)} + sigma_ = 1./tau_ + mu_ = v_/tau_ + """ + tau_ = np.empty(self.N,dtype=float) + v_ = np.empty(self.N,dtype=float) + + #Initial values - Marginal moments + z = np.empty(self.N,dtype=float) + Z_hat = np.empty(self.N,dtype=float) + phi = np.empty(self.N,dtype=float) + mu_hat = np.empty(self.N,dtype=float) + sigma2_hat = np.empty(self.N,dtype=float) + + #Approximation + epsilon_np1 = 1 + epsilon_np2 = 1 + self.iterations = 0 + np1 = [tau_tilde.copy()] + np2 = [v_tilde.copy()] + while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: + update_order = np.random.permutation(self.N) + for i in update_order: + #Cavity distribution parameters + tau_[i] = 1./Sigma_diag[i] - self.eta*tau_tilde[i] + v_[i] = mu[i]/Sigma_diag[i] - self.eta*v_tilde[i] + #Marginal moments + Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood_function.moments_match(self.data[i],tau_[i],v_[i]) + #Site parameters update + Delta_tau = delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i]) + Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i]) + tau_tilde[i] = tau_tilde[i] + Delta_tau + v_tilde[i] = v_tilde[i] + Delta_v + #Posterior distribution parameters update + LLT = LLT + np.outer(Kmn[:,i],Kmn[:,i])*Delta_tau + L = jitchol(LLT) + V,info = linalg.flapack.dtrtrs(L,Kmn,lower=1) + Sigma_diag = np.sum(V*V,-2) + si = np.sum(V.T*V[:,i],-1) + mu = mu + (Delta_v-Delta_tau*mu[i])*si + self.iterations += 1 + #Sigma recomputation with Cholesky decompositon + LLT0 = LLT0 + np.dot(Kmn*tau_tilde[None,:],Kmn.T) + L = jitchol(LLT) + V,info = linalg.flapack.dtrtrs(L,Kmn,lower=1) + V2,info = linalg.flapack.dtrtrs(L.T,V,lower=0) + Sigma_diag = np.sum(V*V,-2) + Knmv_tilde = np.dot(Kmn,v_tilde) + mu = np.dot(V2.T,Knmv_tilde) + epsilon_np1 = sum((tau_tilde-np1[-1])**2)/self.N + epsilon_np2 = sum((v_tilde-np2[-1])**2)/self.N + np1.append(tau_tilde.copy()) + np2.append(v_tilde.copy()) + + self._compute_GP_variables() + + def fit_FITC(self, Knn_diag, Kmn): + """ + The expectation-propagation algorithm with sparse pseudo-input. + For nomenclature see Naish-Guzman and Holden, 2008. 
+ """ + + """ + Prior approximation parameters: + q(f|X) = int_{df}{N(f|KfuKuu_invu,diag(Kff-Qff)*N(u|0,Kuu)} = N(f|0,Sigma0) + Sigma0 = diag(Knn-Qnn) + Qnn, Qnn = Knm*Kmmi*Kmn + """ + Kmmi, self.Lm, self.Lmi, Kmm_logdet = pdinv(Kmm) + P0 = Kmn.T + KmnKnm = np.dot(P0.T, P0) + KmmiKmn = np.dot(Kmmi,P0.T) + Qnn_diag = np.sum(P0.T*KmmiKmn,-2) + Diag0 = Knn_diag - Qnn_diag + R0 = jitchol(Kmmi).T + + """ + Posterior approximation: q(f|y) = N(f| mu, Sigma) + Sigma = Diag + P*R.T*R*P.T + K + mu = w + P*gamma + """ + self.w = np.zeros(self.N) + self.gamma = np.zeros(self.M) + mu = np.zeros(self.N) + P = P0.copy() + R = R0.copy() + Diag = Diag0.copy() + Sigma_diag = Knn_diag + + """ + Initial values - Cavity distribution parameters: + q_(g|mu_,sigma2_) = Product{q_i(g|mu_i,sigma2_i)} + sigma_ = 1./tau_ + mu_ = v_/tau_ + """ + self.tau_ = np.empty(self.N,dtype=float) + self.v_ = np.empty(self.N,dtype=float) + + #Initial values - Marginal moments + z = np.empty(self.N,dtype=float) + self.Z_hat = np.empty(self.N,dtype=float) + phi = np.empty(self.N,dtype=float) + mu_hat = np.empty(self.N,dtype=float) + sigma2_hat = np.empty(self.N,dtype=float) + + #Approximation + epsilon_np1 = 1 + epsilon_np2 = 1 + self.iterations = 0 + self.np1 = [self.tau_tilde.copy()] + self.np2 = [self.v_tilde.copy()] + while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon: + update_order = np.random.permutation(self.N) + for i in update_order: + #Cavity distribution parameters + self.tau_[i] = 1./Sigma_diag[i] - self.eta*self.tau_tilde[i] + self.v_[i] = mu[i]/Sigma_diag[i] - self.eta*self.v_tilde[i] + #Marginal moments + self.Z_hat[i], mu_hat[i], sigma2_hat[i] = self.likelihood_function.moments_match(data[i],self.tau_[i],self.v_[i]) + #Site parameters update + Delta_tau = self.delta/self.eta*(1./sigma2_hat[i] - 1./Sigma_diag[i]) + Delta_v = self.delta/self.eta*(mu_hat[i]/sigma2_hat[i] - mu[i]/Sigma_diag[i]) + self.tau_tilde[i] = self.tau_tilde[i] + Delta_tau + self.v_tilde[i] = self.v_tilde[i] + Delta_v + #Posterior distribution parameters update + dtd1 = Delta_tau*Diag[i] + 1. 
+ dii = Diag[i] + Diag[i] = dii - (Delta_tau * dii**2.)/dtd1 + pi_ = P[i,:].reshape(1,self.M) + P[i,:] = pi_ - (Delta_tau*dii)/dtd1 * pi_ + Rp_i = np.dot(R,pi_.T) + RTR = np.dot(R.T,np.dot(np.eye(self.M) - Delta_tau/(1.+Delta_tau*Sigma_diag[i]) * np.dot(Rp_i,Rp_i.T),R)) + R = jitchol(RTR).T + self.w[i] = self.w[i] + (Delta_v - Delta_tau*self.w[i])*dii/dtd1 + self.gamma = self.gamma + (Delta_v - Delta_tau*mu[i])*np.dot(RTR,P[i,:].T) + RPT = np.dot(R,P.T) + Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1) + mu = self.w + np.dot(P,self.gamma) + self.iterations += 1 + #Sigma recomptutation with Cholesky decompositon + Diag = Diag0/(1.+ Diag0 * self.tau_tilde) + P = (Diag / Diag0)[:,None] * P0 + RPT0 = np.dot(R0,P0.T) + L = jitchol(np.eye(self.M) + np.dot(RPT0,(1./Diag0 - Diag/(Diag0**2))[:,None]*RPT0.T)) + R,info = linalg.flapack.dtrtrs(L,R0,lower=1) + RPT = np.dot(R,P.T) + Sigma_diag = Diag + np.sum(RPT.T*RPT.T,-1) + self.w = Diag * self.v_tilde + self.gamma = np.dot(R.T, np.dot(RPT,self.v_tilde)) + mu = self.w + np.dot(P,self.gamma) + epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N + epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N + self.np1.append(self.tau_tilde.copy()) + self.np2.append(self.v_tilde.copy()) + + return self._compute_GP_variables() diff --git a/GPy/likelihoods/Gaussian.py b/GPy/likelihoods/Gaussian.py new file mode 100644 index 00000000..a34b3e6c --- /dev/null +++ b/GPy/likelihoods/Gaussian.py @@ -0,0 +1,56 @@ +import numpy as np +from likelihood import likelihood + +class Gaussian(likelihood): + def __init__(self,data,variance=1.,normalize=False): + self.is_heteroscedastic = False + self.Nparams = 1 + self.data = data + self.N,D = data.shape + self.Z = 0. # a correction factor which accounts for the approximation made + + #normalisation + if normalize: + self._mean = data.mean(0)[None,:] + self._std = data.std(0)[None,:] + self.Y = (self.data - self._mean)/self._std + else: + self._mean = np.zeros((1,D)) + self._std = np.ones((1,D)) + self.Y = self.data + + #TODO: make this work efficiently (only compute YYT if D>>N) + self.YYT = np.dot(self.Y,self.Y.T) + self.trYYT = np.trace(self.YYT) + self._set_params(np.asarray(variance)) + + + def _get_params(self): + return np.asarray(self._variance) + + def _get_param_names(self): + return ["noise variance"] + + def _set_params(self,x): + self._variance = float(x) + self.covariance_matrix = np.eye(self.N)*self._variance + self.precision = 1./self._variance + + def predictive_values(self,mu,var): + """ + Un-normalise the prediction and add the likelihood variance, then return the 5%, 95% interval + """ + mean = mu*self._std + self._mean + true_var = (var + self._variance)*self._std**2 + _5pc = mean + - 2.*np.sqrt(true_var) + _95pc = mean + 2.*np.sqrt(true_var) + return mean, _5pc, _95pc + + def fit_full(self): + """ + No approximations needed + """ + pass + + def _gradients(self,partial): + return np.sum(partial) diff --git a/GPy/likelihoods/__init__.py b/GPy/likelihoods/__init__.py new file mode 100644 index 00000000..83413255 --- /dev/null +++ b/GPy/likelihoods/__init__.py @@ -0,0 +1,4 @@ +from EP import EP +from Gaussian import Gaussian +# TODO: from Laplace import Laplace +import likelihood_functions as functions diff --git a/GPy/likelihoods/likelihood.py b/GPy/likelihoods/likelihood.py new file mode 100644 index 00000000..6ec57c07 --- /dev/null +++ b/GPy/likelihoods/likelihood.py @@ -0,0 +1,35 @@ +import numpy as np + +class likelihood: + """ + The atom for a likelihood class + + This object interfaces the GP and 
the data. The most basic likelihood + (Gaussian) inherits directly from this, as does the EP algorithm + + Some things must be defined for this to work properly: + self.Y : the effective Gaussian target of the GP + self.N, self.D : Y.shape + self.covariance_matrix : the effective (noise) covariance of the GP targets + self.Z : a factor which gets added to the likelihood (0 for a Gaussian, Z_EP for EP) + self.is_heteroscedastic : enables significant computational savings in GP + self.precision : a scalar or vector representation of the effective target precision + self.YYT : (optional) = np.dot(self.Y, self.Y.T) enables computational savings for D>N + """ + def __init__(self,data): + raise ValueError, "this class is not to be instantiated" + + def _get_params(self): + raise NotImplementedError + + def _get_param_names(self): + raise NotImplementedError + + def _set_params(self,x): + raise NotImplementedError + + def fit(self): + raise NotImplementedError + + def _gradients(self,partial): + raise NotImplementedError diff --git a/GPy/likelihoods/likelihood_functions.py b/GPy/likelihoods/likelihood_functions.py new file mode 100644 index 00000000..de97824a --- /dev/null +++ b/GPy/likelihoods/likelihood_functions.py @@ -0,0 +1,133 @@ +# Copyright (c) 2012, 2013 Ricardo Andrade +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import numpy as np +from scipy import stats +import scipy as sp +import pylab as pb +from ..util.plot import gpplot + +class likelihood_function: + """ + Likelihood class for doing Expectation propagation + + :param Y: observed output (Nx1 numpy.darray) + ..Note:: Y values allowed depend on the likelihood_function used + """ + def __init__(self,location=0,scale=1): + self.location = location + self.scale = scale + +class probit(likelihood_function): + """ + Probit likelihood + Y is expected to take values in {-1,1} + ----- + $$ + L(x) = \\Phi (Y_i*f_i) + $$ + """ + + def moments_match(self,data_i,tau_i,v_i): + """ + Moments match of the marginal approximation in EP algorithm + + :param i: number of observation (int) + :param tau_i: precision of the cavity distribution (float) + :param v_i: mean/variance of the cavity distribution (float) + """ + # TODO: some version of assert np.sum(np.abs(Y)-1) == 0, "Output values must be either -1 or 1" + z = data_i*v_i/np.sqrt(tau_i**2 + tau_i) + Z_hat = stats.norm.cdf(z) + phi = stats.norm.pdf(z) + mu_hat = v_i/tau_i + data_i*phi/(Z_hat*np.sqrt(tau_i**2 + tau_i)) + sigma2_hat = 1./tau_i - (phi/((tau_i**2+tau_i)*Z_hat))*(z+phi/Z_hat) + return Z_hat, mu_hat, sigma2_hat + + def predictive_values(self,mu,var): + """ + Compute mean, and conficence interval (percentiles 5 and 95) of the prediction + """ + mu = mu.flatten() + var = var.flatten() + mean = stats.norm.cdf(mu/np.sqrt(1+var)) + p_05 = np.zeros(mu.shape)#np.zeros([mu.size]) + p_95 = np.zeros(mu.shape)#np.ones([mu.size]) + return mean, p_05, p_95 + +class Poisson(likelihood_function): + """ + Poisson likelihood + Y is expected to take values in {0,1,2,...} + ----- + $$ + L(x) = \exp(\lambda) * \lambda**Y_i / Y_i! 
+ $$ + """ + def moments_match(self,i,tau_i,v_i): + """ + Moments match of the marginal approximation in EP algorithm + + :param i: number of observation (int) + :param tau_i: precision of the cavity distribution (float) + :param v_i: mean/variance of the cavity distribution (float) + """ + mu = v_i/tau_i + sigma = np.sqrt(1./tau_i) + def poisson_norm(f): + """ + Product of the likelihood and the cavity distribution + """ + pdf_norm_f = stats.norm.pdf(f,loc=mu,scale=sigma) + rate = np.exp( (f*self.scale)+self.location) + poisson = stats.poisson.pmf(float(self.Y[i]),rate) + return pdf_norm_f*poisson + + def log_pnm(f): + """ + Log of poisson_norm + """ + return -(-.5*(f-mu)**2/sigma**2 - np.exp( (f*self.scale)+self.location) + ( (f*self.scale)+self.location)*self.Y[i]) + + """ + Golden Search and Simpson's Rule + -------------------------------- + Simpson's Rule is used to calculate the moments mumerically, it needs a grid of points as input. + Golden Search is used to find the mode in the poisson_norm distribution and define around it the grid for Simpson's Rule + """ + #TODO golden search & simpson's rule can be defined in the general likelihood class, rather than in each specific case. + + #Golden search + golden_A = -1 if self.Y[i] == 0 else np.array([np.log(self.Y[i]),mu]).min() #Lower limit + golden_B = np.array([np.log(self.Y[i]),mu]).max() #Upper limit + golden_A = (golden_A - self.location)/self.scale + golden_B = (golden_B - self.location)/self.scale + opt = sp.optimize.golden(log_pnm,brack=(golden_A,golden_B)) #Better to work with log_pnm than with poisson_norm + + # Simpson's approximation + width = 3./np.log(max(self.Y[i],2)) + A = opt - width #Lower limit + B = opt + width #Upper limit + K = 10*int(np.log(max(self.Y[i],150))) #Number of points in the grid, we DON'T want K to be the same number for every case + h = (B-A)/K # length of the intervals + grid_x = np.hstack([np.linspace(opt-width,opt,K/2+1)[1:-1], np.linspace(opt,opt+width,K/2+1)]) # grid of points (X axis) + x = np.hstack([A,B,grid_x[range(1,K,2)],grid_x[range(2,K-1,2)]]) # grid_x rearranged, just to make Simpson's algorithm easier + zeroth = np.hstack([poisson_norm(A),poisson_norm(B),[4*poisson_norm(f) for f in grid_x[range(1,K,2)]],[2*poisson_norm(f) for f in grid_x[range(2,K-1,2)]]]) # grid of points (Y axis) rearranged like x + first = zeroth*x + second = first*x + Z_hat = sum(zeroth)*h/3 # Zero-th moment + mu_hat = sum(first)*h/(3*Z_hat) # First moment + m2 = sum(second)*h/(3*Z_hat) # Second moment + sigma2_hat = m2 - mu_hat**2 # Second central moment + return float(Z_hat), float(mu_hat), float(sigma2_hat) + + def predictive_values(self,mu,var): + """ + Compute mean, and conficence interval (percentiles 5 and 95) of the prediction + """ + mean = np.exp(mu*self.scale + self.location) + tmp = stats.poisson.ppf(np.array([.05,.95]),mu) + p_05 = tmp[:,0] + p_95 = tmp[:,1] + return mean,p_05,p_95 diff --git a/GPy/models/BGPLVM.py b/GPy/models/BGPLVM.py index db147944..ffa7df54 100644 --- a/GPy/models/BGPLVM.py +++ b/GPy/models/BGPLVM.py @@ -5,10 +5,12 @@ import numpy as np import pylab as pb import sys, pdb from GPLVM import GPLVM -from sparse_GP_regression import sparse_GP_regression +from sparse_GP import sparse_GP from GPy.util.linalg import pdinv +from ..likelihoods import Gaussian +from .. 
import kern -class Bayesian_GPLVM(sparse_GP_regression, GPLVM): +class Bayesian_GPLVM(sparse_GP, GPLVM): """ Bayesian Gaussian Process Latent Variable Model @@ -20,15 +22,23 @@ class Bayesian_GPLVM(sparse_GP_regression, GPLVM): :type init: 'PCA'|'random' """ - def __init__(self, Y, Q, init='PCA', **kwargs): + def __init__(self, Y, Q, init='PCA', M=10, Z=None, **kwargs): X = self.initialise_latent(init, Q, Y) - S = np.ones_like(X) * 1e-2# - sparse_GP_regression.__init__(self, X, Y, X_uncertainty = S, **kwargs) + + if Z is None: + Z = np.random.permutation(X.copy())[:M] + else: + assert Z.shape[1]==X.shape[1] + + kernel = kern.rbf(Q) + kern.white(Q) + + S = np.ones_like(X) * 1e-2# + sparse_GP.__init__(self, X, Gaussian(Y), X_uncertainty = S, Z=Z,**kwargs) def _get_param_names(self): X_names = sum([['X_%i_%i'%(n,q) for n in range(self.N)] for q in range(self.Q)],[]) S_names = sum([['S_%i_%i'%(n,q) for n in range(self.N)] for q in range(self.Q)],[]) - return (X_names + S_names + sparse_GP_regression._get_param_names(self)) + return (X_names + S_names + sparse_GP._get_param_names(self)) def _get_params(self): """ @@ -40,13 +50,13 @@ class Bayesian_GPLVM(sparse_GP_regression, GPLVM): =============================================================== """ - return np.hstack((self.X.flatten(), self.X_uncertainty.flatten(), sparse_GP_regression._get_params(self))) + return np.hstack((self.X.flatten(), self.X_uncertainty.flatten(), sparse_GP._get_params(self))) def _set_params(self,x): N, Q = self.N, self.Q self.X = x[:self.X.size].reshape(N,Q).copy() self.X_uncertainty = x[(N*Q):(2*N*Q)].reshape(N,Q).copy() - sparse_GP_regression._set_params(self, x[(2*N*Q):]) + sparse_GP._set_params(self, x[(2*N*Q):]) def dL_dmuS(self): dL_dmu_psi0, dL_dS_psi0 = self.kern.dpsi1_dmuS(self.dL_dpsi1,self.Z,self.X,self.X_uncertainty) @@ -58,5 +68,5 @@ class Bayesian_GPLVM(sparse_GP_regression, GPLVM): return np.hstack((dL_dmu.flatten(), dL_dS.flatten())) def _log_likelihood_gradients(self): - return np.hstack((self.dL_dmuS().flatten(), sparse_GP_regression._log_likelihood_gradients(self))) + return np.hstack((self.dL_dmuS().flatten(), sparse_GP._log_likelihood_gradients(self))) diff --git a/GPy/models/GP.py b/GPy/models/GP.py new file mode 100644 index 00000000..2afa4252 --- /dev/null +++ b/GPy/models/GP.py @@ -0,0 +1,248 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import numpy as np +import pylab as pb +from .. import kern +from ..core import model +from ..util.linalg import pdinv,mdot +from ..util.plot import gpplot,x_frame1D,x_frame2D, Tango +from ..likelihoods import EP + +class GP(model): + """ + Gaussian Process model for regression and EP + + :param X: input observations + :param kernel: a GPy kernel, defaults to rbf+white + :parm likelihood: a GPy likelihood + :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales) + :type normalize_X: False|True + :param normalize_Y: whether to normalize the input data before computing (predictions will be in original scales) + :type normalize_Y: False|True + :param Xslices: how the X,Y data co-vary in the kernel (i.e. which "outputs" they correspond to). See (link:slicing) + :rtype: model object + :param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1 + :param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.] + :type powerep: list + + .. 
Note:: Multiple independent outputs are allowed using columns of Y + + """ + + def __init__(self, X, likelihood, kernel, normalize_X=False, Xslices=None): + + # parse arguments + self.Xslices = Xslices + self.X = X + assert len(self.X.shape)==2 + self.N, self.Q = self.X.shape + assert isinstance(kernel, kern.kern) + self.kern = kernel + + #here's some simple normalisation for the inputs + if normalize_X: + self._Xmean = X.mean(0)[None,:] + self._Xstd = X.std(0)[None,:] + self.X = (X.copy() - self._Xmean) / self._Xstd + if hasattr(self,'Z'): + self.Z = (self.Z - self._Xmean) / self._Xstd + else: + self._Xmean = np.zeros((1,self.X.shape[1])) + self._Xstd = np.ones((1,self.X.shape[1])) + + self.likelihood = likelihood + #assert self.X.shape[0] == self.likelihood.Y.shape[0] + #self.N, self.D = self.likelihood.Y.shape + assert self.X.shape[0] == self.likelihood.data.shape[0] + self.N, self.D = self.likelihood.data.shape + + model.__init__(self) + + def _set_params(self,p): + self.kern._set_params_transformed(p[:self.kern.Nparam]) + self.likelihood._set_params(p[self.kern.Nparam:]) + + self.K = self.kern.K(self.X,slices1=self.Xslices) + self.K += self.likelihood.covariance_matrix + + self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K) + + #the gradient of the likelihood wrt the covariance matrix + if self.likelihood.YYT is None: + alpha = np.dot(self.Ki,self.likelihood.Y) + self.dL_dK = 0.5*(np.dot(alpha,alpha.T)-self.D*self.Ki) + else: + tmp = mdot(self.Ki, self.likelihood.YYT, self.Ki) + self.dL_dK = 0.5*(tmp - self.D*self.Ki) + + def _get_params(self): + return np.hstack((self.kern._get_params_transformed(), self.likelihood._get_params())) + + def _get_param_names(self): + return self.kern._get_param_names_transformed() + self.likelihood._get_param_names() + + def update_likelihood_approximation(self): + """ + Approximates a non-gaussian likelihood using Expectation Propagation + + For a Gaussian (or direct: TODO) likelihood, no iteration is required: + this function does nothing + """ + self.likelihood.fit_full(self.kern.K(self.X)) + self._set_params(self._get_params()) # update the GP + + def _model_fit_term(self): + """ + Computes the model fit using YYT if it's available + """ + if self.likelihood.YYT is None: + return -0.5*np.sum(np.square(np.dot(self.Li,self.likelihood.Y))) + else: + return -0.5*np.sum(np.multiply(self.Ki, self.likelihood.YYT)) + + def log_likelihood(self): + """ + The log marginal likelihood of the GP. + + For an EP model, can be written as the log likelihood of a regression + model for a new variable Y* = v_tilde/tau_tilde, with a covariance + matrix K* = K + diag(1./tau_tilde) plus a normalization term. + """ + return -0.5*self.D*self.K_logdet + self._model_fit_term() + self.likelihood.Z + + + def _log_likelihood_gradients(self): + """ + The gradient of all parameters. 
+
+        For the kernel parameters, use the chain rule via dL_dK
+
+        For the likelihood parameters, pass in alpha = K^-1 y
+        """
+        return np.hstack((self.kern.dK_dtheta(partial=self.dL_dK,X=self.X), self.likelihood._gradients(partial=np.diag(self.dL_dK))))
+
+    def _raw_predict(self,_Xnew,slices=None, full_cov=False):
+        """
+        Internal helper function for making predictions, does not account
+        for normalisation or likelihood
+        """
+        Kx = self.kern.K(self.X,_Xnew, slices1=self.Xslices,slices2=slices)
+        mu = np.dot(np.dot(Kx.T,self.Ki),self.likelihood.Y)
+        KiKx = np.dot(self.Ki,Kx)
+        if full_cov:
+            Kxx = self.kern.K(_Xnew, slices1=slices,slices2=slices)
+            var = Kxx - np.dot(KiKx.T,Kx) #NOTE this won't work for plotting
+        else:
+            Kxx = self.kern.Kdiag(_Xnew, slices=slices)
+            var = Kxx - np.sum(np.multiply(KiKx,Kx),0)
+        return mu, var[:,None]
+
+
+    def predict(self,Xnew, slices=None, full_cov=False):
+        """
+        Predict the function(s) at the new point(s) Xnew.
+
+        Arguments
+        ---------
+        :param Xnew: The points at which to make a prediction
+        :type Xnew: np.ndarray, Nnew x self.Q
+        :param slices: specifies which outputs kernel(s) the Xnew correspond to (see below)
+        :type slices: (None, list of slice objects, list of ints)
+        :param full_cov: whether to return the full covariance matrix, or just the diagonal
+        :type full_cov: bool
+        :rtype: posterior mean, a Numpy array, Nnew x self.D
+        :rtype: posterior variance, a Numpy array, Nnew x Nnew x (self.D)
+
+        .. Note:: "slices" specifies how the points X_new co-vary with the training points.
+
+            - If None, the new points covary through every kernel part (default)
+            - If a list of slices, the i^th slice specifies which data are affected by the i^th kernel part
+            - If a list of booleans, each boolean specifies whether the corresponding kernel part is active
+
+        If full_cov and self.D > 1, the return shape of var is Nnew x Nnew x self.D. If self.D == 1, the return shape is Nnew x Nnew.
+        This is to allow for different normalisations of the output dimensions.
+
+        """
+        #normalise X values
+        Xnew = (Xnew.copy() - self._Xmean) / self._Xstd
+        mu, var = self._raw_predict(Xnew, slices, full_cov=full_cov)
+
+        #now push through likelihood TODO
+        mean, _5pc, _95pc = self.likelihood.predictive_values(mu, var)
+
+        return mean, _5pc, _95pc
+
+
+    def plot_internal(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None,full_cov=False):
+        """
+        Plot the GP's view of the world, where the data is normalised and the likelihood is Gaussian
+
+        :param samples: the number of a posteriori samples to plot
+        :param which_data: which of the training data to plot (default all)
+        :type which_data: 'all' or a slice object to slice self.X, self.Y
+        :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaults to data limits
+        :param which_functions: which of the kernel functions to plot (additively)
+        :type which_functions: list of bools
+        :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D
+
+        Plot the posterior of the GP.
+        - In one dimension, the function is plotted with a shaded region identifying two standard deviations.
+        - In two dimensions, a contour-plot shows the mean predicted function
+        - In higher dimensions, we've not implemented this yet !TODO!
+ + Can plot only part of the data and part of the posterior functions using which_data and which_functions + Plot the data's view of the world, with non-normalised values and GP predictions passed through the likelihood + """ + if which_functions=='all': + which_functions = [True]*self.kern.Nparts + if which_data=='all': + which_data = slice(None) + + if self.X.shape[1] == 1: + Xnew, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits) + m,v = self._raw_predict(Xnew, slices=which_functions) + gpplot(Xnew,m,m-np.sqrt(v),m+np.sqrt(v)) + pb.plot(self.X[which_data],self.likelihood.Y[which_data],'kx',mew=1.5) + pb.xlim(xmin,xmax) + elif X.shape[1]==2: + resolution = resolution or 50 + Xnew, xmin, xmax,xx,yy = x_frame2D(self.X, plot_limits=plot_limits) + m,v = self._raw_predict(Xnew, slices=which_functions) + m = m.reshape(resolution,resolution) + pb.contour(xx,yy,zz,vmin=zz.min(),vmax=zz.max(),cmap=pb.cm.jet) + pb.scatter(Xorig[:,0],Xorig[:,1],40,Yorig,linewidth=0,cmap=pb.cm.jet,vmin=zz.min(),vmax=zz.max()) + pb.xlim(xmin[0],xmax[0]) + pb.ylim(xmin[1],xmax[1]) + else: + raise NotImplementedError, "Cannot define a frame with more than two input dimensions" + + def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None,full_cov=False): + if which_functions=='all': + which_functions = [True]*self.kern.Nparts + if which_data=='all': + which_data = slice(None) + + if self.X.shape[1] == 1: + Xnew, xmin, xmax = x_frame1D(self.X, plot_limits=plot_limits) + m, lower, upper = self.predict(Xnew, slices=which_functions) + gpplot(Xnew,m, lower, upper) + pb.plot(self.X[which_data],self.likelihood.data[which_data],'kx',mew=1.5) + ymin,ymax = self.likelihood.data.min()*1.2,self.likelihood.data.max()*1.2 + pb.xlim(xmin,xmax) + pb.ylim(ymin,ymax) + elif X.shape[1]==2: + resolution = resolution or 50 + Xnew, xmin, xmax,xx,yy = x_frame2D(self.X, plot_limits=plot_limits) + m,v = self.predict(Xnew, slices=which_functions) + m = m.reshape(resolution,resolution) + pb.contour(xx,yy,zz,vmin=zz.min(),vmax=zz.max(),cmap=pb.cm.jet) + pb.scatter(Xorig[:,0],Xorig[:,1],40,Yorig,linewidth=0,cmap=pb.cm.jet,vmin=zz.min(),vmax=zz.max()) + pb.xlim(xmin[0],xmax[0]) + pb.ylim(xmin[1],xmax[1]) + else: + raise NotImplementedError, "Cannot define a frame with more than two input dimensions" + + + diff --git a/GPy/models/GP_EP.py b/GPy/models/GP_EP.py deleted file mode 100644 index 51d69d0a..00000000 --- a/GPy/models/GP_EP.py +++ /dev/null @@ -1,160 +0,0 @@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). -# Licensed under the BSD 3-clause license (see LICENSE.txt) - - -import numpy as np -import pylab as pb -from scipy import stats, linalg -from .. import kern -from ..inference.Expectation_Propagation import Full -from ..inference.likelihoods import likelihood,probit#,poisson,gaussian -from ..core import model -from ..util.linalg import pdinv,jitchol -from ..util.plot import gpplot - -class GP_EP(model): - def __init__(self,X,likelihood,kernel=None,epsilon_ep=1e-3,epsion_em=.1,powerep=[1.,1.]): - """ - Simple Gaussian Process with Non-Gaussian likelihood - - Arguments - --------- - :param X: input observations (NxD numpy.darray) - :param likelihood: a GPy likelihood (likelihood class) - :param kernel: a GPy kernel (kern class) - :param epsilon_ep: convergence criterion for the Expectation Propagation algorithm, defaults to 0.1 (float) - :param powerep: power-EP parameters [$\eta$,$\delta$], defaults to [1.,1.] (list) - :rtype: GPy model class. 
- """ - if kernel is None: - kernel = kern.rbf(X.shape[1]) + kern.bias(X.shape[1]) + kern.white(X.shape[1]) - - assert isinstance(kernel,kern.kern), 'kernel is not a kern instance' - self.likelihood = likelihood - self.Y = self.likelihood.Y - self.kernel = kernel - self.X = X - self.N, self.D = self.X.shape - self.eta,self.delta = powerep - self.epsilon_ep = epsilon_ep - self.jitter = 1e-12 - self.K = self.kernel.K(self.X) - model.__init__(self) - - def _set_params(self,p): - self.kernel._set_params_transformed(p) - - def _get_params(self): - return self.kernel._get_params_transformed() - - def _get_param_names(self): - return self.kernel._get_param_names_transformed() - - def approximate_likelihood(self): - self.ep_approx = Full(self.K,self.likelihood,epsilon=self.epsilon_ep,powerep=[self.eta,self.delta]) - self.ep_approx.fit_EP() - - def posterior_param(self): - self.K = self.kernel.K(self.X) - self.Sroot_tilde_K = np.sqrt(self.ep_approx.tau_tilde)[:,None]*self.K - B = np.eye(self.N) + np.sqrt(self.ep_approx.tau_tilde)[None,:]*self.Sroot_tilde_K - #self.L = np.linalg.cholesky(B) - self.L = jitchol(B) - V,info = linalg.flapack.dtrtrs(self.L,self.Sroot_tilde_K,lower=1) - self.Sigma = self.K - np.dot(V.T,V) - self.mu = np.dot(self.Sigma,self.ep_approx.v_tilde) - - def log_likelihood(self): - """ - Returns - ------- - The EP approximation to the log-marginal likelihood - """ - self.posterior_param() - mu_ = self.ep_approx.v_/self.ep_approx.tau_ - L1 =.5*sum(np.log(1+self.ep_approx.tau_tilde*1./self.ep_approx.tau_))-sum(np.log(np.diag(self.L))) - L2A =.5*np.sum((self.Sigma-np.diag(1./(self.ep_approx.tau_+self.ep_approx.tau_tilde))) * np.dot(self.ep_approx.v_tilde[:,None],self.ep_approx.v_tilde[None,:])) - L2B = .5*np.dot(mu_*(self.ep_approx.tau_/(self.ep_approx.tau_tilde+self.ep_approx.tau_)),self.ep_approx.tau_tilde*mu_ - 2*self.ep_approx.v_tilde) - L3 = sum(np.log(self.ep_approx.Z_hat)) - return L1 + L2A + L2B + L3 - - def _log_likelihood_gradients(self): - dK_dp = self.kernel.dK_dtheta(self.X) - self.dK_dp = dK_dp - aux1,info_1 = linalg.flapack.dtrtrs(self.L,np.dot(self.Sroot_tilde_K,self.ep_approx.v_tilde),lower=1) - b = self.ep_approx.v_tilde - np.sqrt(self.ep_approx.tau_tilde)*linalg.flapack.dtrtrs(self.L.T,aux1)[0] - U,info_u = linalg.flapack.dtrtrs(self.L,np.diag(np.sqrt(self.ep_approx.tau_tilde)),lower=1) - dL_dK = 0.5*(np.outer(b,b)-np.dot(U.T,U)) - self.dL_dK = dL_dK - return np.array([np.sum(dK_dpi*dL_dK) for dK_dpi in dK_dp.T]) - - def predict(self,X): - #TODO: check output dimensions - self.posterior_param() - K_x = self.kernel.K(self.X,X) - Kxx = self.kernel.K(X) - aux1,info = linalg.flapack.dtrtrs(self.L,np.dot(self.Sroot_tilde_K,self.ep_approx.v_tilde),lower=1) - aux2,info = linalg.flapack.dtrtrs(self.L.T, aux1,lower=0) - zeta = np.sqrt(self.ep_approx.tau_tilde)*aux2 - f = np.dot(K_x.T,self.ep_approx.v_tilde-zeta) - v,info = linalg.flapack.dtrtrs(self.L,np.sqrt(self.ep_approx.tau_tilde)[:,None]*K_x,lower=1) - variance = Kxx - np.dot(v.T,v) - vdiag = np.diag(variance) - y=self.likelihood.predictive_mean(f,vdiag) - return f,vdiag,y - - def plot(self): - """ - Plot the fitted model: training function values, inducing points used, mean estimate and confidence intervals. 
- """ - if self.X.shape[1]==1: - pb.figure() - xmin,xmax = self.X.min(),self.X.max() - xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin) - Xnew = np.linspace(xmin,xmax,100)[:,None] - mu_f, var_f, mu_phi = self.predict(Xnew) - pb.subplot(211) - self.likelihood.plot1Da(X_new=Xnew,Mean_new=mu_f,Var_new=var_f,X_u=self.X,Mean_u=self.mu,Var_u=np.diag(self.Sigma)) - pb.subplot(212) - self.likelihood.plot1Db(self.X,Xnew,mu_phi) - elif self.X.shape[1]==2: - pb.figure() - x1min,x1max = self.X[:,0].min(0),self.X[:,0].max(0) - x2min,x2max = self.X[:,1].min(0),self.X[:,1].max(0) - x1min, x1max = x1min-0.2*(x1max-x1min), x1max+0.2*(x1max-x1min) - x2min, x2max = x2min-0.2*(x2max-x2min), x2max+0.2*(x1max-x1min) - axis1 = np.linspace(x1min,x1max,50) - axis2 = np.linspace(x2min,x2max,50) - XX1, XX2 = [e.flatten() for e in np.meshgrid(axis1,axis2)] - Xnew = np.c_[XX1.flatten(),XX2.flatten()] - f,v,p = self.predict(Xnew) - self.likelihood.plot2D(self.X,Xnew,p) - else: - raise NotImplementedError, "Cannot plot GPs with more than two input dimensions" - - def em(self,max_f_eval=1e4,epsilon=.1,plot_all=False): #TODO check this makes sense - """ - Fits sparse_EP and optimizes the hyperparametes iteratively until convergence is achieved. - """ - self.epsilon_em = epsilon - log_likelihood_change = self.epsilon_em + 1. - self.parameters_path = [self.kernel._get_params()] - self.approximate_likelihood() - self.site_approximations_path = [[self.ep_approx.tau_tilde,self.ep_approx.v_tilde]] - self.log_likelihood_path = [self.log_likelihood()] - iteration = 0 - while log_likelihood_change > self.epsilon_em: - print 'EM iteration', iteration - self.optimize(max_f_eval = max_f_eval) - log_likelihood_new = self.log_likelihood() - log_likelihood_change = log_likelihood_new - self.log_likelihood_path[-1] - if log_likelihood_change < 0: - print 'log_likelihood decrement' - self.kernel._set_params_transformed(self.parameters_path[-1]) - self.kernM._set_params_transformed(self.parameters_path[-1]) - else: - self.approximate_likelihood() - self.log_likelihood_path.append(self.log_likelihood()) - self.parameters_path.append(self.kernel._get_params()) - self.site_approximations_path.append([self.ep_approx.tau_tilde,self.ep_approx.v_tilde]) - iteration += 1 diff --git a/GPy/models/GP_regression.py b/GPy/models/GP_regression.py index ed139ab3..5f9f9f3e 100644 --- a/GPy/models/GP_regression.py +++ b/GPy/models/GP_regression.py @@ -1,18 +1,18 @@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Copyright (c) 2012, James Hensman # Licensed under the BSD 3-clause license (see LICENSE.txt) import numpy as np -import pylab as pb +from GP import GP +from .. import likelihoods from .. 
import kern -from ..core import model -from ..util.linalg import pdinv,mdot -from ..util.plot import gpplot, Tango -class GP_regression(model): +class GP_regression(GP): """ Gaussian Process model for regression + This is a thin wrapper around the GP class, with a set of sensible defalts + :param X: input observations :param Y: observed values :param kernel: a GPy kernel, defaults to rbf+white @@ -29,200 +29,8 @@ class GP_regression(model): def __init__(self,X,Y,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None): if kernel is None: - kernel = kern.rbf(X.shape[1]) + kern.bias(X.shape[1]) + kern.white(X.shape[1]) + kernel = kern.rbf(X.shape[1]) - # parse arguments - self.Xslices = Xslices - assert isinstance(kernel, kern.kern) - self.kern = kernel - self.X = X - self.Y = Y - assert len(self.X.shape)==2 - assert len(self.Y.shape)==2 - assert self.X.shape[0] == self.Y.shape[0] - self.N, self.D = self.Y.shape - self.N, self.Q = self.X.shape + likelihood = likelihoods.Gaussian(Y,normalize=normalize_Y) - #here's some simple normalisation - if normalize_X: - self._Xmean = X.mean(0)[None,:] - self._Xstd = X.std(0)[None,:] - self.X = (X.copy() - self._Xmean) / self._Xstd - if hasattr(self,'Z'): - self.Z = (self.Z - self._Xmean) / self._Xstd - else: - self._Xmean = np.zeros((1,self.X.shape[1])) - self._Xstd = np.ones((1,self.X.shape[1])) - - if normalize_Y: - self._Ymean = Y.mean(0)[None,:] - self._Ystd = Y.std(0)[None,:] - self.Y = (Y.copy()- self._Ymean) / self._Ystd - else: - self._Ymean = np.zeros((1,self.Y.shape[1])) - self._Ystd = np.ones((1,self.Y.shape[1])) - - if self.D > self.N: - # then it's more efficient to store YYT - self.YYT = np.dot(self.Y, self.Y.T) - else: - self.YYT = None - - model.__init__(self) - - def _set_params(self,p): - self.kern._set_params_transformed(p) - self.K = self.kern.K(self.X,slices1=self.Xslices) - self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K) - - def _get_params(self): - return self.kern._get_params_transformed() - - def _get_param_names(self): - return self.kern._get_param_names_transformed() - - def _model_fit_term(self): - """ - Computes the model fit using YYT if it's available - """ - if self.YYT is None: - return -0.5*np.sum(np.square(np.dot(self.Li,self.Y))) - else: - return -0.5*np.sum(np.multiply(self.Ki, self.YYT)) - - def log_likelihood(self): - complexity_term = -0.5*self.N*self.D*np.log(2.*np.pi) - 0.5*self.D*self.K_logdet - return complexity_term + self._model_fit_term() - - def dL_dK(self): - if self.YYT is None: - alpha = np.dot(self.Ki,self.Y) - dL_dK = 0.5*(np.dot(alpha,alpha.T)-self.D*self.Ki) - else: - dL_dK = 0.5*(mdot(self.Ki, self.YYT, self.Ki) - self.D*self.Ki) - - return dL_dK - - def _log_likelihood_gradients(self): - return self.kern.dK_dtheta(partial=self.dL_dK(),X=self.X) - - def predict(self,Xnew, slices=None, full_cov=False): - """ - - Predict the function(s) at the new point(s) Xnew. - - Arguments - --------- - :param Xnew: The points at which to make a prediction - :type Xnew: np.ndarray, Nnew x self.Q - :param slices: specifies which outputs kernel(s) the Xnew correspond to (see below) - :type slices: (None, list of slice objects, list of ints) - :param full_cov: whether to return the folll covariance matrix, or just the diagonal - :type full_cov: bool - :rtype: posterior mean, a Numpy array, Nnew x self.D - :rtype: posterior variance, a Numpy array, Nnew x Nnew x (self.D) - - .. Note:: "slices" specifies how the the points X_new co-vary wich the training points. 
- - - If None, the new points covary throigh every kernel part (default) - - If a list of slices, the i^th slice specifies which data are affected by the i^th kernel part - - If a list of booleans, specifying which kernel parts are active - - If full_cov and self.D > 1, the return shape of var is Nnew x Nnew x self.D. If self.D == 1, the return shape is Nnew x Nnew. - This is to allow for different normalisations of the output dimensions. - - - """ - - #normalise X values - Xnew = (Xnew.copy() - self._Xmean) / self._Xstd - mu, var = self._raw_predict(Xnew, slices, full_cov) - - #un-normalise - mu = mu*self._Ystd + self._Ymean - if full_cov: - if self.D==1: - var *= np.square(self._Ystd) - else: - var = var[:,:,None] * np.square(self._Ystd) - else: - if self.D==1: - var *= np.square(np.squeeze(self._Ystd)) - else: - var = var[:,None] * np.square(self._Ystd) - - return mu,var - - def _raw_predict(self,_Xnew,slices, full_cov=False): - """Internal helper function for making predictions, does not account for normalisation""" - Kx = self.kern.K(self.X,_Xnew, slices1=self.Xslices,slices2=slices) - mu = np.dot(np.dot(Kx.T,self.Ki),self.Y) - KiKx = np.dot(self.Ki,Kx) - if full_cov: - Kxx = self.kern.K(_Xnew, slices1=slices,slices2=slices) - var = Kxx - np.dot(KiKx.T,Kx) - else: - Kxx = self.kern.Kdiag(_Xnew, slices=slices) - var = Kxx - np.sum(np.multiply(KiKx,Kx),0) - return mu, var - - def plot(self,samples=0,plot_limits=None,which_data='all',which_functions='all',resolution=None): - """ - :param samples: the number of a posteriori samples to plot - :param which_data: which if the training data to plot (default all) - :type which_data: 'all' or a slice object to slice self.X, self.Y - :param plot_limits: The limits of the plot. If 1D [xmin,xmax], if 2D [[xmin,ymin],[xmax,ymax]]. Defaluts to data limits - :param which_functions: which of the kernel functions to plot (additively) - :type which_functions: list of bools - :param resolution: the number of intervals to sample the GP on. Defaults to 200 in 1D and 50 (a 50x50 grid) in 2D - - Plot the posterior of the GP. - - In one dimension, the function is plotted with a shaded region identifying two standard deviations. - - In two dimsensions, a contour-plot shows the mean predicted function - - In higher dimensions, we've no implemented this yet !TODO! 
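# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition): undoing the output normalisation in
# the (removed) predict() method above.  Ymean/Ystd stand in for the stored
# per-column statistics _Ymean/_Ystd; only the diagonal-variance case is shown.
import numpy as np

def unnormalize_prediction(mu, var, Ymean, Ystd):
    """Map predictions on the normalised outputs back to the original scale."""
    mu = mu * Ystd + Ymean                    # shift/scale the mean back
    var = var[:, None] * np.square(Ystd)      # variances scale with std^2
    return mu, var
# ---------------------------------------------------------------------------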
- - Can plot only part of the data and part of the posterior functions using which_data and which_functions - """ - if which_functions=='all': - which_functions = [True]*self.kern.Nparts - if which_data=='all': - which_data = slice(None) - - X = self.X[which_data,:] - Y = self.Y[which_data,:] - - Xorig = X*self._Xstd + self._Xmean - Yorig = Y*self._Ystd + self._Ymean - if plot_limits is None: - xmin,xmax = Xorig.min(0),Xorig.max(0) - xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin) - elif len(plot_limits)==2: - xmin, xmax = plot_limits - else: - raise ValueError, "Bad limits for plotting" - - - if self.X.shape[1]==1: - Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None] - m,v = self.predict(Xnew,slices=which_functions) - gpplot(Xnew,m,v) - if samples: - m,v = self.predict(Xnew,slices=which_functions,full_cov=True) - s = np.random.multivariate_normal(m.flatten(),v,samples) - pb.plot(Xnew.flatten(),s.T, alpha = 0.4, c='#3465a4', linewidth = 0.8) - pb.plot(Xorig,Yorig,'kx',mew=1.5) - pb.xlim(xmin,xmax) - - elif self.X.shape[1]==2: - resolution = 50 or resolution - xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution] - Xtest = np.vstack((xx.flatten(),yy.flatten())).T - zz,vv = self.predict(Xtest,slices=which_functions) - zz = zz.reshape(resolution,resolution) - pb.contour(xx,yy,zz,vmin=zz.min(),vmax=zz.max(),cmap=pb.cm.jet) - pb.scatter(Xorig[:,0],Xorig[:,1],40,Yorig,linewidth=0,cmap=pb.cm.jet,vmin=zz.min(),vmax=zz.max()) - pb.xlim(xmin[0],xmax[0]) - pb.ylim(xmin[1],xmax[1]) - - else: - raise NotImplementedError, "Cannot plot GPs with more than two input dimensions" + GP.__init__(self, X, likelihood, kernel, normalize_X=normalize_X, Xslices=Xslices) diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py index 9d5b1d00..9cc8fa68 100644 --- a/GPy/models/__init__.py +++ b/GPy/models/__init__.py @@ -2,12 +2,13 @@ # Licensed under the BSD 3-clause license (see LICENSE.txt) +from GP import GP from GP_regression import GP_regression +from sparse_GP import sparse_GP from sparse_GP_regression import sparse_GP_regression from GPLVM import GPLVM from warped_GP import warpedGP -from GP_EP import GP_EP -from generalized_FITC import generalized_FITC +# TODO: from generalized_FITC import generalized_FITC from sparse_GPLVM import sparse_GPLVM -from uncollapsed_sparse_GP import uncollapsed_sparse_GP +#from uncollapsed_sparse_GP import uncollapsed_sparse_GP from BGPLVM import Bayesian_GPLVM diff --git a/GPy/models/generalized_FITC.py b/GPy/models/generalized_FITC.py index a5ed8d0a..57ae2407 100644 --- a/GPy/models/generalized_FITC.py +++ b/GPy/models/generalized_FITC.py @@ -9,7 +9,8 @@ from .. import kern from ..core import model from ..util.linalg import pdinv,mdot from ..util.plot import gpplot -from ..inference.Expectation_Propagation import FITC +#from ..inference.Expectation_Propagation import FITC +from ..inference.EP import FITC from ..inference.likelihoods import likelihood,probit class generalized_FITC(model): diff --git a/GPy/models/sparse_GP.py b/GPy/models/sparse_GP.py new file mode 100644 index 00000000..a90f73cb --- /dev/null +++ b/GPy/models/sparse_GP.py @@ -0,0 +1,218 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import numpy as np +import pylab as pb +from ..util.linalg import mdot, jitchol, chol_inv, pdinv +from ..util.plot import gpplot +from .. 
import kern +from GP import GP + +#Still TODO: +# make use of slices properly (kernel can now do this) +# enable heteroscedatic noise (kernel will need to compute psi2 as a (NxMxM) array) + +class sparse_GP(GP): + """ + Variational sparse GP model + + :param X: inputs + :type X: np.ndarray (N x Q) + :param likelihood: a likelihood instance, containing the observed data + :type likelihood: GPy.likelihood.(Gaussian | EP) + :param kernel : the kernel/covariance function. See link kernels + :type kernel: a GPy kernel + :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance) + :type X_uncertainty: np.ndarray (N x Q) | None + :param Z: inducing inputs (optional, see note) + :type Z: np.ndarray (M x Q) | None + :param Zslices: slices for the inducing inputs (see slicing TODO: link) + :param M : Number of inducing points (optional, default 10. Ignored if Z is not None) + :type M: int + :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales) + :type normalize_(X|Y): bool + """ + + def __init__(self, X, likelihood, kernel, Z, X_uncertainty=None, Xslices=None,Zslices=None, normalize_X=False): + self.scale_factor = 1.0# a scaling factor to help keep the algorithm stable + + self.Z = Z + self.Zslices = Zslices + self.Xslices = Xslices + self.M = Z.shape[0] + self.likelihood = likelihood + + if X_uncertainty is None: + self.has_uncertain_inputs=False + else: + assert X_uncertainty.shape==X.shape + self.has_uncertain_inputs=True + self.X_uncertainty = X_uncertainty + + GP.__init__(self, X, likelihood, kernel=kernel, normalize_X=normalize_X, Xslices=Xslices) + + #normalise X uncertainty also + if self.has_uncertain_inputs: + self.X_uncertainty /= np.square(self._Xstd) + + def _computations(self): + # TODO find routine to multiply triangular matrices + #TODO: slices for psi statistics (easy enough) + + sf = self.scale_factor + sf2 = sf**2 + + # kernel computations, using BGPLVM notation + self.Kmm = self.kern.K(self.Z) + if self.has_uncertain_inputs: + self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty) + self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T + self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty) + if self.likelihood.is_heteroscedastic: + self.psi2_beta_scaled = (self.psi2*(self.likelihood.precision.reshape(self.N,1,1)/sf2)).sum(0) + #TODO: what is the likelihood is heterscedatic and there are multiple independent outputs? 
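# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition): the heteroscedastic branch above
# weights each datum's (M x M) psi2 slice by that datum's noise precision
# (scaled by 1/sf^2) and sums over the data.  Numbers are arbitrary stand-ins.
import numpy as np

N, M = 5, 3
psi2 = np.random.randn(N, M, M)               # one M x M slice per datum
precision = 1.0 + np.random.rand(N)           # per-datum noise precisions
sf2 = 1.0                                     # scale_factor ** 2
psi2_beta_scaled = (psi2 * (precision.reshape(N, 1, 1) / sf2)).sum(0)
assert psi2_beta_scaled.shape == (M, M)
# ---------------------------------------------------------------------------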
+ else: + self.psi2_beta_scaled = (self.psi2*(self.likelihood.precision/sf2)).sum(0) + else: + self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices) + self.psi1 = self.kern.K(self.Z,self.X) + if self.likelihood.is_heteroscedastic: + tmp = self.psi1*(np.sqrt(self.likelihood.precision.reshape(self.N,1))/sf) + else: + tmp = self.psi1*(np.sqrt(self.likelihood.precision)/sf) + self.psi2_beta_scaled = np.dot(tmp,tmp.T) + self.psi2 = self.psi1.T[:,:,None]*self.psi1.T[:,None,:] # TODO: remove me for efficiency and stability + + self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm) + + self.V = (self.likelihood.precision/self.scale_factor)*self.likelihood.Y + self.A = mdot(self.Lmi, self.psi2_beta_scaled, self.Lmi.T) + self.B = np.eye(self.M)/sf2 + self.A + + self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B) + + self.psi1V = np.dot(self.psi1, self.V) + self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T) + self.C = mdot(self.Lmi.T, self.Bi, self.Lmi) + self.E = mdot(self.C, self.psi1VVpsi1/sf2, self.C.T) + + # Compute dL_dpsi # FIXME: this is untested for the het. case + self.dL_dpsi0 = - 0.5 * self.D * self.likelihood.precision * np.ones(self.N) + self.dL_dpsi1 = mdot(self.V, self.psi1V.T,self.C).T + if self.likelihood.is_heteroscedastic: + self.dL_dpsi2 = 0.5 * self.likelihood.precision[:,None,None] * self.D * self.Kmmi[None,:,:] # dB + self.dL_dpsi2 += - 0.5 * self.likelihood.precision[:,None,None]/sf2 * self.D * self.C[None,:,:] # dC + self.dL_dpsi2 += - 0.5 * self.likelihood.precision[:,None,None]* self.E[None,:,:] # dD + else: + self.dL_dpsi2 = 0.5 * self.likelihood.precision * self.D * self.Kmmi[None,:,:] # dB + self.dL_dpsi2 += - 0.5 * self.likelihood.precision/sf2 * self.D * self.C[None,:,:] # dC + self.dL_dpsi2 += - 0.5 * self.likelihood.precision * self.E[None,:,:] # dD + + # Compute dL_dKmm + self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi)*sf2 # dB + self.dL_dKmm += -0.5 * self.D * (- self.C/sf2 - 2.*mdot(self.C, self.psi2_beta_scaled, self.Kmmi) + self.Kmmi) # dC + self.dL_dKmm += np.dot(np.dot(self.E*sf2, self.psi2_beta_scaled) - np.dot(self.C, self.psi1VVpsi1), self.Kmmi) + 0.5*self.E # dD + + #the partial derivative vector for the likelihood + if self.likelihood.Nparams ==0: + #save computation here. 
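# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition): the intermediate matrices built in
# _computations above for the homoscedastic, certain-input case, using plain
# Cholesky calls in place of GPy's pdinv helper.  Kmm is (M x M), psi1 is
# (M x N), Y is (N x D), beta a scalar noise precision, sf the stabilising
# scale factor; the function name is hypothetical.
import numpy as np

def sparse_gp_matrices(Kmm, psi1, Y, beta, sf=1.0):
    M = Kmm.shape[0]
    sf2 = sf ** 2
    Lm = np.linalg.cholesky(Kmm)
    Lmi = np.linalg.inv(Lm)
    tmp = psi1 * (np.sqrt(beta) / sf)
    psi2_beta_scaled = np.dot(tmp, tmp.T)               # beta * psi1 psi1^T / sf^2
    A = np.dot(np.dot(Lmi, psi2_beta_scaled), Lmi.T)
    B = np.eye(M) / sf2 + A
    Bi = np.linalg.inv(B)
    C = np.dot(np.dot(Lmi.T, Bi), Lmi)                  # = (Kmm/sf^2 + psi2_beta_scaled)^{-1}
    V = (beta / sf) * Y
    psi1V = np.dot(psi1, V)
    E = np.dot(np.dot(C, np.dot(psi1V, psi1V.T) / sf2), C.T)
    return A, B, C, E, psi1V
# ---------------------------------------------------------------------------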
+ self.partial_for_likelihood = None + elif self.likelihood.is_heteroscedastic: + raise NotImplementedError, "heteroscedatic derivates not implemented" + #self.partial_for_likelihood = - 0.5 * self.D*self.likelihood.precision + 0.5 * (self.likelihood.Y**2).sum(1)*self.likelihood.precision**2 #dA + #self.partial_for_likelihood += 0.5 * self.D * (self.psi0*self.likelihood.precision**2 - (self.psi2*self.Kmmi[None,:,:]*self.likelihood.precision[:,None,None]**2).sum(1).sum(1)/sf2) #dB + #self.partial_for_likelihood += 0.5 * self.D * np.sum(self.Bi*self.A)*self.likelihood.precision #dC + #self.partial_for_likelihood += -np.diag(np.dot((self.C - 0.5 * mdot(self.C,self.psi2_beta_scaled,self.C) ) , self.psi1VVpsi1 ))*self.likelihood.precision #dD + else: + #likelihood is not heterscedatic + beta = self.likelihood.precision + dbeta = 0.5 * self.N*self.D/beta - 0.5 * np.sum(np.square(self.likelihood.Y)) + dbeta += - 0.5 * self.D * (self.psi0.sum() - np.trace(self.A)/beta*sf2) + dbeta += - 0.5 * self.D * np.sum(self.Bi*self.A)/beta + dbeta += np.sum((self.C - 0.5 * mdot(self.C,self.psi2_beta_scaled,self.C) ) * self.psi1VVpsi1 )/beta + self.partial_for_likelihood = -dbeta*self.likelihood.precision**2 + + + def _set_params(self, p): + self.Z = p[:self.M*self.Q].reshape(self.M, self.Q) + self.kern._set_params(p[self.Z.size:self.Z.size+self.kern.Nparam]) + self.likelihood._set_params(p[self.Z.size+self.kern.Nparam:]) + self._computations() + + def _get_params(self): + return np.hstack([self.Z.flatten(),GP._get_params(self)]) + + def _get_param_names(self): + return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + GP._get_param_names(self) + + def log_likelihood(self): + """ Compute the (lower bound on the) log marginal likelihood """ + sf2 = self.scale_factor**2 + if self.likelihood.is_heteroscedastic: + A = -0.5*self.N*self.D*np.log(2.*np.pi) +0.5*np.sum(np.log(self.likelihood.precision)) -0.5*np.sum(self.V*self.likelihood.Y) + else: + A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.likelihood.precision)) -0.5*self.likelihood.precision*self.likelihood.trYYT + B = -0.5*self.D*(np.sum(self.likelihood.precision*self.psi0) - np.trace(self.A)*sf2) + C = -0.5*self.D * (self.B_logdet + self.M*np.log(sf2)) + D = +0.5*np.sum(self.psi1VVpsi1 * self.C) + return A+B+C+D + + def _log_likelihood_gradients(self): + return np.hstack((self.dL_dZ().flatten(), self.dL_dtheta(), self.likelihood._gradients(partial=self.partial_for_likelihood))) + + def dL_dtheta(self): + """ + Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel + """ + dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z) + if self.has_uncertain_inputs: + dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty) + dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty) + dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.dL_dpsi1.T, self.Z,self.X, self.X_uncertainty) + else: + #re-cast computations in psi2 back to psi1: + dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2.sum(0),self.psi1) + dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X) + dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X) + + return dL_dtheta + + def dL_dZ(self): + """ + The derivative of the bound wrt the inducing inputs Z + """ + dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ + if self.has_uncertain_inputs: + dL_dZ += 
self.kern.dpsi1_dZ(self.dL_dpsi1,self.Z,self.X, self.X_uncertainty) + dL_dZ += 2.*self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # 'stripes' + else: + #re-cast computations in psi2 back to psi1: + dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2.sum(0),self.psi1) + dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X) + return dL_dZ + + def _raw_predict(self, Xnew, slices, full_cov=False): + """Internal helper function for making predictions, does not account for normalisation""" + + Kx = self.kern.K(self.Z, Xnew) + mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V) + if full_cov: + Kxx = self.kern.K(Xnew) + var = Kxx - mdot(Kx.T, (self.Kmmi - self.C/self.scale_factor**2), Kx) #NOTE thiswon't work for plotting + else: + Kxx = self.kern.Kdiag(Xnew) + var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0) + + return mu,var[:,None] + + def plot(self, *args, **kwargs): + """ + Plot the fitted model: just call the GP plot function and then add inducing inputs + """ + GP.plot(self,*args,**kwargs) + if self.Q==1: + pb.plot(self.Z,self.Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12) + if self.has_uncertain_inputs: + pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten())) + if self.Q==2: + pb.plot(self.Z[:,0],self.Z[:,1],'wo') diff --git a/GPy/models/sparse_GP_old.py b/GPy/models/sparse_GP_old.py new file mode 100644 index 00000000..7b043209 --- /dev/null +++ b/GPy/models/sparse_GP_old.py @@ -0,0 +1,258 @@ +# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import numpy as np +import pylab as pb +from ..util.linalg import mdot, jitchol, chol_inv, pdinv +from ..util.plot import gpplot +from .. import kern +from GP import GP + + +#Still TODO: +# make use of slices properly (kernel can now do this) +# enable heteroscedatic noise (kernel will need to compute psi2 as a (NxMxM) array) + +class sparse_GP(GP): + """ + Variational sparse GP model (Regression) + + :param X: inputs + :type X: np.ndarray (N x Q) + :param Y: observed data + :type Y: np.ndarray of observations (N x D) + :param kernel : the kernel/covariance function. See link kernels + :type kernel: a GPy kernel + :param Z: inducing inputs (optional, see note) + :type Z: np.ndarray (M x Q) | None + :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance) + :type X_uncertainty: np.ndarray (N x Q) | None + :param Zslices: slices for the inducing inputs (see slicing TODO: link) + :param M : Number of inducing points (optional, default 10. Ignored if Z is not None) + :type M: int + :param beta: noise precision. 
TODO> ignore beta if doing EP + :type beta: float + :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales) + :type normalize_(X|Y): bool + """ + + def __init__(self,X,Y=None,kernel=None,X_uncertainty=None,beta=100.,Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False,likelihood=None,method_ep='DTC',epsilon_ep=1e-3,power_ep=[1.,1.]): + + if Z is None: + self.Z = np.random.permutation(X.copy())[:M] + self.M = M + else: + assert Z.shape[1]==X.shape[1] + self.Z = Z + self.M = Z.shape[0] + if X_uncertainty is None: + self.has_uncertain_inputs=False + else: + assert X_uncertainty.shape==X.shape + self.has_uncertain_inputs=True + self.X_uncertainty = X_uncertainty + + GP.__init__(self, X=X, Y=Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y,likelihood=likelihood,epsilon_ep=epsilon_ep,power_ep=power_ep) + + #normalise X uncertainty also + if self.has_uncertain_inputs: + self.X_uncertainty /= np.square(self._Xstd) + + if not self.EP: + self.trYYT = np.sum(np.square(self.Y)) + else: + self.method_ep = method_ep + + #normalise X uncertainty also + if self.has_uncertain_inputs: + self.X_uncertainty /= np.square(self._Xstd) + + def _set_params(self, p): + self.Z = p[:self.M*self.Q].reshape(self.M, self.Q) + if not self.EP: + self.beta = p[self.M*self.Q] + self.kern._set_params(p[self.Z.size + 1:]) + else: + self.kern._set_params(p[self.Z.size:]) + if self.Y is None: + self.Y = np.ones([self.N,1]) + self._compute_kernel_matrices() + self._computations() + + def _get_params(self): + if not self.EP: + return np.hstack([self.Z.flatten(),self.beta,self.kern._get_params_transformed()]) + else: + return np.hstack([self.Z.flatten(),self.kern._get_params_transformed()]) + + def _get_param_names(self): + if not self.EP: + return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + ['noise_precision']+self.kern._get_param_names_transformed() + else: + return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + self.kern._get_param_names_transformed() + + + def _compute_kernel_matrices(self): + # kernel computations, using BGPLVM notation + #TODO: slices for psi statistics (easy enough) + + self.Kmm = self.kern.K(self.Z) + if self.has_uncertain_inputs: + if not self.EP: + self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty)#.sum() NOTE psi0 is now a vector + self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T + self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty) + #self.psi2_beta_scaled = ? 
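# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition): the packing/unpacking convention
# used by _get_params and _set_params above -- inducing inputs first, then the
# noise precision, then the kernel parameters.  Numbers are arbitrary stand-ins.
import numpy as np

M, Q = 4, 2
Z = np.random.randn(M, Q)
beta = 100.0
kern_params = np.array([1.0, 0.5])                  # stand-in kernel parameters

p = np.hstack([Z.flatten(), beta, kern_params])     # pack into one flat vector
Z_new = p[:M * Q].reshape(M, Q)                     # ...and unpack again
beta_new = p[M * Q]
kern_params_new = p[M * Q + 1:]
# ---------------------------------------------------------------------------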
+ else: + raise NotImplementedError, "uncertain_inputs not yet supported for EP" + else: + self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices)#.sum() + self.psi1 = self.kern.K(self.Z,self.X) + self.psi2 = np.dot(self.psi1,self.psi1.T) + self.psi2_beta_scaled = np.dot(self.psi1,self.beta*self.psi1.T) + + def _computations(self): + # TODO find routine to multiply triangular matrices + self.V = self.beta*self.Y + self.psi1V = np.dot(self.psi1, self.V) + self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T) + self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm) + self.A = mdot(self.Lmi, self.psi2_beta_scaled, self.Lmi.T) + self.B = np.eye(self.M) + self.A + self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B) + self.LLambdai = np.dot(self.LBi, self.Lmi) + self.LBL_inv = mdot(self.Lmi.T, self.Bi, self.Lmi) + self.C = mdot(self.LLambdai, self.psi1V) + self.G = mdot(self.LBL_inv, self.psi1VVpsi1, self.LBL_inv.T) + self.trace_K_beta_scaled = (self.psi0*self.beta).sum() - np.trace(self.A) + if not self.EP: + self.trace_K = self.psi0.sum() - np.trace(self.A)/self.beta + + # Compute dL_dpsi + self.dL_dpsi1 = mdot(self.LLambdai.T,self.C,self.V.T) + if not self.EP: + self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N) + if self.has_uncertain_inputs: + self.dL_dpsi2 = - 0.5 * self.beta * (self.D*(self.LBL_inv - self.Kmmi) + self.G) + else: + self.dL_dpsi2_ = - 0.5 * (self.D*(self.LBL_inv - self.Kmmi) + self.G) + else: + self.dL_dpsi0 = - 0.5 * self.D * self.beta.flatten() + if not self.has_uncertain_inputs: + self.dL_dpsi2_ = - 0.5 * (self.D*(self.LBL_inv - self.Kmmi) + self.G) + + # Compute dL_dKmm + self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi) # dB + self.dL_dKmm += -0.5 * self.D * (- self.LBL_inv - 2.*mdot(self.LBL_inv, self.psi2_beta_scaled, self.Kmmi) + self.Kmmi) # dC + self.dL_dKmm += np.dot(np.dot(self.G,self.psi2_beta_scaled) - np.dot(self.LBL_inv, self.psi1VVpsi1), self.Kmmi) + 0.5*self.G # dE + + def approximate_likelihood(self): + assert not isinstance(self.likelihood, gaussian), "EP is only available for non-gaussian likelihoods" + if self.method_ep == 'DTC': + self.ep_approx = DTC(self.Kmm,self.likelihood,self.psi1,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta]) + elif self.method_ep == 'FITC': + self.ep_approx = FITC(self.Kmm,self.likelihood,self.psi1,self.psi0,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta]) + else: + self.ep_approx = Full(self.X,self.likelihood,self.kernel,inducing=None,epsilon=self.epsilon_ep,power_ep=[self.eta,self.delta]) + self.beta, self.Y, self.Z_ep = self.ep_approx.fit_EP() + self.trbetaYYT = np.sum(np.square(self.Y)*self.beta) + self._computations() + + def log_likelihood(self): + """ + Compute the (lower bound on the) log marginal likelihood + """ + if not self.EP: + A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta)) + D = -0.5*self.beta*self.trYYT + else: + A = -0.5*self.D*(self.N*np.log(2.*np.pi) - np.sum(np.log(self.beta))) + D = -0.5*self.trbetaYYT + B = -0.5*self.D*self.trace_K_beta_scaled + C = -0.5*self.D * self.B_logdet + E = +0.5*np.sum(self.psi1VVpsi1 * self.LBL_inv) + return A+B+C+D+E + + def dL_dbeta(self): + """ + Compute the gradient of the log likelihood wrt beta. 
+ """ + #TODO: suport heteroscedatic noise + dA_dbeta = 0.5 * self.N*self.D/self.beta + dB_dbeta = - 0.5 * self.D * self.trace_K + dC_dbeta = - 0.5 * self.D * np.sum(self.Bi*self.A)/self.beta + dD_dbeta = - 0.5 * self.trYYT + tmp = mdot(self.LBi.T, self.LLambdai, self.psi1V) + dE_dbeta = (np.sum(np.square(self.C)) - 0.5 * np.sum(self.A * np.dot(tmp, tmp.T)))/self.beta + + return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta + dE_dbeta) + + def dL_dtheta(self): + """ + Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel + """ + dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z) + if self.has_uncertain_inputs: + dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty) + dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty) + dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # for multiple_beta, dL_dpsi2 will be a different shape + else: + #re-cast computations in psi2 back to psi1: + dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2_,self.beta.T*self.psi1) #dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1) + dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X) + dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X) + + return dL_dtheta + + def dL_dZ(self): + """ + The derivative of the bound wrt the inducing inputs Z + """ + dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z,)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ + if self.has_uncertain_inputs: + dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty) + dL_dZ += self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) + else: + #re-cast computations in psi2 back to psi1: + dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2_,self.beta.T*self.psi1)#dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2,self.psi1) + dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X) + return dL_dZ + + def _log_likelihood_gradients(self): + if not self.EP: + return np.hstack([self.dL_dZ().flatten(), self.dL_dbeta(), self.dL_dtheta()]) + else: + return np.hstack([self.dL_dZ().flatten(), self.dL_dtheta()]) + + def _raw_predict(self, Xnew, slices, full_cov=False): + """Internal helper function for making predictions, does not account for normalisation""" + Kx = self.kern.K(self.Z, Xnew) + mu = mdot(Kx.T, self.LBL_inv, self.psi1V) + phi = None + if full_cov: + Kxx = self.kern.K(Xnew) + var = Kxx - mdot(Kx.T, (self.Kmmi - self.LBL_inv), Kx) + if not self.EP: + var += np.eye(Xnew.shape[0])/self.beta + else: + raise NotImplementedError, "full_cov = True not implemented for EP" + else: + Kxx = self.kern.Kdiag(Xnew) + var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.LBL_inv, Kx),0) + if not self.EP: + var += 1./self.beta + else: + phi = self.likelihood.predictive_mean(mu,var) + return mu,var,phi + + def plot(self, *args, **kwargs): + """ + Plot the fitted model: just call the GP_regression plot function and then add inducing inputs + """ + GP.plot(self,*args,**kwargs) + if self.Q==1: + pb.plot(self.Z,self.Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12) + if self.has_uncertain_inputs: + pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten())) + if self.Q==2: + pb.plot(self.Z[:,0],self.Z[:,1],'wo') diff --git a/GPy/models/sparse_GP_regression.py b/GPy/models/sparse_GP_regression.py index 07ce4d97..178c8023 100644 --- a/GPy/models/sparse_GP_regression.py +++ b/GPy/models/sparse_GP_regression.py @@ -1,205 +1,44 
@@ -# Copyright (c) 2012, GPy authors (see AUTHORS.txt). +# Copyright (c) 2012, James Hensman # Licensed under the BSD 3-clause license (see LICENSE.txt) + import numpy as np -import pylab as pb -from ..util.linalg import mdot, jitchol, chol_inv, pdinv -from ..util.plot import gpplot +from sparse_GP import sparse_GP +from .. import likelihoods from .. import kern -from ..inference.likelihoods import likelihood -from GP_regression import GP_regression -#Still TODO: -# make use of slices properly (kernel can now do this) -# enable heteroscedatic noise (kernel will need to compute psi2 as a (NxMxM) array) - -class sparse_GP_regression(GP_regression): +class sparse_GP_regression(sparse_GP): """ - Variational sparse GP model (Regression) + Gaussian Process model for regression + + This is a thin wrapper around the GP class, with a set of sensible defalts + + :param X: input observations + :param Y: observed values + :param kernel: a GPy kernel, defaults to rbf+white + :param normalize_X: whether to normalize the input data before computing (predictions will be in original scales) + :type normalize_X: False|True + :param normalize_Y: whether to normalize the input data before computing (predictions will be in original scales) + :type normalize_Y: False|True + :param Xslices: how the X,Y data co-vary in the kernel (i.e. which "outputs" they correspond to). See (link:slicing) + :rtype: model object + + .. Note:: Multiple independent outputs are allowed using columns of Y - :param X: inputs - :type X: np.ndarray (N x Q) - :param Y: observed data - :type Y: np.ndarray of observations (N x D) - :param kernel : the kernel/covariance function. See link kernels - :type kernel: a GPy kernel - :param Z: inducing inputs (optional, see note) - :type Z: np.ndarray (M x Q) | None - :param X_uncertainty: The uncertainty in the measurements of X (Gaussian variance) - :type X_uncertainty: np.ndarray (N x Q) | None - :param Zslices: slices for the inducing inputs (see slicing TODO: link) - :param M : Number of inducing points (optional, default 10. Ignored if Z is not None) - :type M: int - :param beta: noise precision. 
TODO> ignore beta if doing EP - :type beta: float - :param normalize_(X|Y) : whether to normalize the data before computing (predictions will be in original scales) - :type normalize_(X|Y): bool """ - def __init__(self,X,Y,kernel=None, X_uncertainty=None, beta=100., Z=None,Zslices=None,M=10,normalize_X=False,normalize_Y=False): - self.scale_factor = 1000.0 - self.beta = beta + def __init__(self,X,Y,kernel=None,normalize_X=False,normalize_Y=False, Xslices=None,Z=None, M=10): + #kern defaults to rbf + if kernel is None: + kernel = kern.rbf(X.shape[1]) + kern.white(X.shape[1],1e-3) + + #Z defaults to a subset of the data if Z is None: - self.Z = np.random.permutation(X.copy())[:M] - self.M = M + Z = np.random.permutation(X.copy())[:M] else: assert Z.shape[1]==X.shape[1] - self.Z = Z - self.M = Z.shape[0] - if X_uncertainty is None: - self.has_uncertain_inputs=False - else: - assert X_uncertainty.shape==X.shape - self.has_uncertain_inputs=True - self.X_uncertainty = X_uncertainty - GP_regression.__init__(self, X, Y, kernel=kernel, normalize_X=normalize_X, normalize_Y=normalize_Y) - self.trYYT = np.sum(np.square(self.Y)) + #likelihood defaults to Gaussian + likelihood = likelihoods.Gaussian(Y,normalize=normalize_Y) - #normalise X uncertainty also - if self.has_uncertain_inputs: - self.X_uncertainty /= np.square(self._Xstd) - - def _computations(self): - # TODO find routine to multiply triangular matrices - #TODO: slices for psi statistics (easy enough) - - # kernel computations, using BGPLVM notation - self.Kmm = self.kern.K(self.Z) - if self.has_uncertain_inputs: - self.psi0 = self.kern.psi0(self.Z,self.X, self.X_uncertainty).sum() - self.psi1 = self.kern.psi1(self.Z,self.X, self.X_uncertainty).T - self.psi2 = self.kern.psi2(self.Z,self.X, self.X_uncertainty) - self.psi2_beta_scaled = (self.psi2*(self.beta/self.scale_factor**2)).sum(0) - else: - self.psi0 = self.kern.Kdiag(self.X,slices=self.Xslices).sum() - self.psi1 = self.kern.K(self.Z,self.X) - #self.psi2 = np.dot(self.psi1,self.psi1.T) - #self.psi2 = self.psi1.T[:,:,None]*self.psi1.T[:,None,:] - tmp = self.psi1/(self.scale_factor/np.sqrt(self.beta)) - self.psi2_beta_scaled = np.dot(tmp,tmp.T) - - sf = self.scale_factor - sf2 = sf**2 - - self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)#+np.eye(self.M)*1e-3) - - self.V = (self.beta/self.scale_factor)*self.Y - self.A = mdot(self.Lmi, self.psi2_beta_scaled, self.Lmi.T) - self.B = np.eye(self.M)/sf2 + self.A - - self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B) - - self.psi1V = np.dot(self.psi1, self.V) - self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T) - self.C = mdot(self.Lmi.T, self.Bi, self.Lmi) - self.E = mdot(self.C, self.psi1VVpsi1/sf2, self.C.T) - - # Compute dL_dpsi - self.dL_dpsi0 = - 0.5 * self.D * self.beta * np.ones(self.N) - self.dL_dpsi1 = mdot(self.V, self.psi1V.T,self.C).T - self.dL_dpsi2 = 0.5 * self.beta * self.D * self.Kmmi[None,:,:] # dB - self.dL_dpsi2 += - 0.5 * self.beta/sf2 * self.D * self.C[None,:,:] # dC - self.dL_dpsi2 += - 0.5 * self.beta * self.E[None,:,:] # dD - - # Compute dL_dKmm - self.dL_dKmm = -0.5 * self.D * mdot(self.Lmi.T, self.A, self.Lmi)*sf2 # dB - self.dL_dKmm += -0.5 * self.D * (- self.C/sf2 - 2.*mdot(self.C, self.psi2_beta_scaled, self.Kmmi) + self.Kmmi) # dC - self.dL_dKmm += np.dot(np.dot(self.E*sf2, self.psi2_beta_scaled) - np.dot(self.C, self.psi1VVpsi1), self.Kmmi) + 0.5*self.E # dD - - - def _set_params(self, p): - self.Z = p[:self.M*self.Q].reshape(self.M, self.Q) - self.beta = p[self.M*self.Q] - 
self.kern._set_params(p[self.Z.size + 1:]) - self._computations() - - def _get_params(self): - return np.hstack([self.Z.flatten(),self.beta,self.kern._get_params_transformed()]) - - def _get_param_names(self): - return sum([['iip_%i_%i'%(i,j) for i in range(self.Z.shape[0])] for j in range(self.Z.shape[1])],[]) + ['noise_precision']+self.kern._get_param_names_transformed() - - - def log_likelihood(self): - """ Compute the (lower bound on the) log marginal likelihood """ - sf2 = self.scale_factor**2 - A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta)) -0.5*self.beta*self.trYYT - B = -0.5*self.D*(self.beta*self.psi0-np.trace(self.A)*sf2) - C = -0.5*self.D * (self.B_logdet + self.M*np.log(sf2)) - D = +0.5*np.sum(self.psi1VVpsi1 * self.C) - return A+B+C+D - - def _log_likelihood_gradients(self): - return np.hstack([self.dL_dZ().flatten(), self.dL_dbeta(), self.dL_dtheta()]) - - def dL_dbeta(self): - """ - Compute the gradient of the log likelihood wrt beta. - """ - #TODO: suport heteroscedatic noise - sf2 = self.scale_factor**2 - dA_dbeta = 0.5 * self.N*self.D/self.beta - 0.5 * self.trYYT - dB_dbeta = - 0.5 * self.D * (self.psi0 - np.trace(self.A)/self.beta*sf2) - dC_dbeta = - 0.5 * self.D * np.sum(self.Bi*self.A)/self.beta - dD_dbeta = np.sum((self.C - 0.5 * mdot(self.C,self.psi2_beta_scaled,self.C) ) * self.psi1VVpsi1 )/self.beta - - return np.squeeze(dA_dbeta + dB_dbeta + dC_dbeta + dD_dbeta) - - def dL_dtheta(self): - """ - Compute and return the derivative of the log marginal likelihood wrt the parameters of the kernel - """ - dL_dtheta = self.kern.dK_dtheta(self.dL_dKmm,self.Z) - if self.has_uncertain_inputs: - dL_dtheta += self.kern.dpsi0_dtheta(self.dL_dpsi0, self.Z,self.X,self.X_uncertainty) - dL_dtheta += self.kern.dpsi1_dtheta(self.dL_dpsi1.T,self.Z,self.X, self.X_uncertainty) - dL_dtheta += self.kern.dpsi2_dtheta(self.dL_dpsi2,self.dL_dpsi1.T, self.Z,self.X, self.X_uncertainty) # for multiple_beta, dL_dpsi2 will be a different shape - else: - #re-cast computations in psi2 back to psi1: - dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2.sum(0),self.psi1) - dL_dtheta += self.kern.dK_dtheta(dL_dpsi1,self.Z,self.X) - dL_dtheta += self.kern.dKdiag_dtheta(self.dL_dpsi0, self.X) - - return dL_dtheta - - def dL_dZ(self): - """ - The derivative of the bound wrt the inducing inputs Z - """ - dL_dZ = 2.*self.kern.dK_dX(self.dL_dKmm,self.Z)#factor of two becase of vertical and horizontal 'stripes' in dKmm_dZ - if self.has_uncertain_inputs: - dL_dZ += self.kern.dpsi1_dZ(self.dL_dpsi1,self.Z,self.X, self.X_uncertainty) - dL_dZ += 2.*self.kern.dpsi2_dZ(self.dL_dpsi2,self.Z,self.X, self.X_uncertainty) # 'stripes' - else: - #re-cast computations in psi2 back to psi1: - dL_dpsi1 = self.dL_dpsi1 + 2.*np.dot(self.dL_dpsi2.sum(0),self.psi1) - dL_dZ += self.kern.dK_dX(dL_dpsi1,self.Z,self.X) - return dL_dZ - - def _raw_predict(self, Xnew, slices, full_cov=False): - """Internal helper function for making predictions, does not account for normalisation""" - - Kx = self.kern.K(self.Z, Xnew) - mu = mdot(Kx.T, self.C/self.scale_factor, self.psi1V) - - if full_cov: - Kxx = self.kern.K(Xnew) - var = Kxx - mdot(Kx.T, (self.Kmmi - self.C/self.scale_factor**2), Kx) + np.eye(Xnew.shape[0])/self.beta # TODO: This beta doesn't belong here in the EP case. - else: - Kxx = self.kern.Kdiag(Xnew) - var = Kxx - np.sum(Kx*np.dot(self.Kmmi - self.C/self.scale_factor**2, Kx),0) + 1./self.beta # TODO: This beta doesn't belong here in the EP case. 
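# ---------------------------------------------------------------------------
# Illustrative sketch (editor's addition): the sparse predictive equations
# used in _raw_predict above (Gaussian-likelihood case; the 1/beta noise term
# is the one the TODO flags as not belonging in the EP case).  C is the
# (M x M) matrix from the computations, psi1V = psi1 . V, Kmmi = Kmm^{-1};
# the helper name is hypothetical.
import numpy as np

def sparse_raw_predict(Kx, Kxx_diag, Kmmi, C, psi1V, beta, sf=1.0):
    """Predictive mean and marginal variance at the test inputs."""
    mu = np.dot(np.dot(Kx.T, C / sf), psi1V)
    var = Kxx_diag - np.sum(Kx * np.dot(Kmmi - C / sf ** 2, Kx), 0) + 1.0 / beta
    return mu, var
# ---------------------------------------------------------------------------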
- - return mu,var - - def plot(self, *args, **kwargs): - """ - Plot the fitted model: just call the GP_regression plot function and then add inducing inputs - """ - GP_regression.plot(self,*args,**kwargs) - if self.Q==1: - pb.plot(self.Z,self.Z*0+pb.ylim()[0],'k|',mew=1.5,markersize=12) - if self.has_uncertain_inputs: - pb.errorbar(self.X[:,0], pb.ylim()[0]+np.zeros(self.N), xerr=2*np.sqrt(self.X_uncertainty.flatten())) - if self.Q==2: - pb.plot(self.Z[:,0],self.Z[:,1],'wo') + sparse_GP.__init__(self, X, likelihood, kernel, Z, normalize_X=normalize_X, Xslices=Xslices) diff --git a/GPy/testing/unit_tests.py b/GPy/testing/unit_tests.py index a302b25f..4cc1c4ab 100644 --- a/GPy/testing/unit_tests.py +++ b/GPy/testing/unit_tests.py @@ -154,17 +154,16 @@ class GradientTests(unittest.TestCase): m.constrain_positive('(linear|bias|white)') self.assertTrue(m.checkgrad()) - def test_GP_EP(self): - return # Disabled TODO + def test_GP_EP_probit(self): N = 20 - X = np.hstack([np.random.rand(N/2)+1,np.random.rand(N/2)-1])[:,None] - k = GPy.kern.rbf(1) + GPy.kern.white(1) - Y = np.hstack([np.ones(N/2),-np.ones(N/2)])[:,None] - likelihood = GPy.inference.likelihoods.probit(Y) - m = GPy.models.GP_EP(X,likelihood,k) - m.constrain_positive('(var|len)') - m.approximate_likelihood() - self.assertTrue(m.checkgrad()) + X = np.hstack([np.random.normal(5,2,N/2),np.random.normal(10,2,N/2)])[:,None] + Y = np.hstack([np.ones(N/2),np.repeat(-1,N/2)])[:,None] + kernel = GPy.kern.rbf(1) + distribution = GPy.likelihoods.likelihood_functions.probit() + likelihood = GPy.likelihoods.EP(Y,distribution) + m = GPy.models.GP(X,kernel,likelihood=likelihood) + m.ensure_default_constraints() + self.assertTrue(m.EPEM) @unittest.skip("FITC will be broken for a while") def test_generalized_FITC(self): diff --git a/GPy/util/plot.py b/GPy/util/plot.py index 8c06633e..7b346330 100644 --- a/GPy/util/plot.py +++ b/GPy/util/plot.py @@ -6,30 +6,26 @@ import Tango import pylab as pb import numpy as np -def gpplot(x,mu,var,edgecol=Tango.coloursHex['darkBlue'],fillcol=Tango.coloursHex['lightBlue'],axes=None,**kwargs): +def gpplot(x,mu,lower,upper,edgecol=Tango.coloursHex['darkBlue'],fillcol=Tango.coloursHex['lightBlue'],axes=None,**kwargs): if axes is None: axes = pb.gca() mu = mu.flatten() x = x.flatten() + lower = lower.flatten() + upper = upper.flatten() #here's the mean axes.plot(x,mu,color=edgecol,linewidth=2) - #ensure variance is a vector - if len(var.shape)>1: - err = 2*np.sqrt(np.diag(var)) - else: - err = 2*np.sqrt(var) - - #here's the 2*std box + #here's the box kwargs['linewidth']=0.5 if not 'alpha' in kwargs.keys(): kwargs['alpha'] = 0.3 - axes.fill(np.hstack((x,x[::-1])),np.hstack((mu+err,mu[::-1]-err[::-1])),color=fillcol,**kwargs) + axes.fill(np.hstack((x,x[::-1])),np.hstack((upper,lower[::-1])),color=fillcol,**kwargs) #this is the edge: - axes.plot(x,mu+err,color=edgecol,linewidth=0.2) - axes.plot(x,mu-err,color=edgecol,linewidth=0.2) + axes.plot(x,upper,color=edgecol,linewidth=0.2) + axes.plot(x,lower,color=edgecol,linewidth=0.2) def removeRightTicks(ax=None): ax = ax or pb.gca() @@ -74,4 +70,37 @@ def align_subplots(N,M,xlim=None, ylim=None): else: removeUpperTicks() +def x_frame1D(X,plot_limits=None,resolution=None): + """ + Internal helper function for making plots, returns a set of input values to plot as well as lower and upper limits + """ + assert X.shape[1] ==1, "x_frame1D is defined for one-dimensional inputs" + if plot_limits is None: + xmin,xmax = X.min(0),X.max(0) + xmin, xmax = xmin-0.2*(xmax-xmin), 
xmax+0.2*(xmax-xmin) + elif len(plot_limits)==2: + xmin, xmax = plot_limits + else: + raise ValueError, "Bad limits for plotting" + + Xnew = np.linspace(xmin,xmax,resolution or 200)[:,None] + return Xnew, xmin, xmax + +def x_frame2D(X,plot_limits=None,resolution=None): + """ + Internal helper function for making plots, returns a set of input values to plot as well as lower and upper limits + """ + assert X.shape[1] ==2, "x_frame2D is defined for two-dimensional inputs" + if plot_limits is None: + xmin,xmax = X.min(0),X.max(0) + xmin, xmax = xmin-0.2*(xmax-xmin), xmax+0.2*(xmax-xmin) + elif len(plot_limits)==2: + xmin, xmax = plot_limits + else: + raise ValueError, "Bad limits for plotting" + + resolution = resolution or 50 + xx,yy = np.mgrid[xmin[0]:xmax[0]:1j*resolution,xmin[1]:xmax[1]:1j*resolution] + Xnew = np.vstack((xx.flatten(),yy.flatten())).T + return Xnew, xx,yy,xmin, xmax
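# ---------------------------------------------------------------------------
# Illustrative usage sketch (editor's addition) for the plotting helpers added
# above.  The import path follows this patch (GPy/util/plot.py); the
# predictions are synthetic stand-ins for a model's mean and 5%/95% quantiles.
import numpy as np
from GPy.util.plot import gpplot, x_frame1D

X = np.random.rand(20, 1)                       # one-dimensional inputs
Xnew, xmin, xmax = x_frame1D(X)                 # padded test grid (200 points)
mu = np.sin(6 * Xnew)                           # stand-in predictive mean
lower, upper = mu - 0.5, mu + 0.5               # stand-in lower/upper quantiles
gpplot(Xnew, mu, lower, upper)                  # mean line plus shaded band
# ---------------------------------------------------------------------------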