print "Warning! constraining %s positive"%name
- powerep : Power-EP parameters (eta,delta) - 2x1 numpy array (floats) - epsilon : Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float) - """ + This is just the base class for expectation propagation. We'll extend it for full and sparse EP. + """ + def __init__(self,likelihood,epsilon=1e-3,powerep=[1.,1.]): self.likelihood = likelihood - assert covariance.shape[0] == covariance.shape[1] - if Kmn is not None: - self.Kmm = covariance - self.Kmn = Kmn - self.M = self.Kmn.shape[0] - self.N = self.Kmn.shape[1] - assert self.M < self.N, 'The number of inducing inputs must be smaller than the number of observations' - else: - self.K = covariance - self.N = self.K.shape[0] - if Knn_diag is not None: - self.Knn_diag = Knn_diag - assert len(Knn_diag) == self.N, 'Knn_diagonal has size different from N' - self.epsilon = epsilon self.eta, self.delta = powerep self.jitter = 1e-12 - """ - Initial values - Likelihood approximation parameters: - p(y|f) = t(f|tau_tilde,v_tilde) - """ - self.tau_tilde = np.zeros(self.N) - self.v_tilde = np.zeros(self.N) + #Initial values - Likelihood approximation parameters: + #p(y|f) = t(f|tau_tilde,v_tilde) + self.restart_EP() def restart_EP(self): """ @@ -59,8 +35,23 @@ class EP: self.v_tilde = np.zeros(self.N) self.mu = np.zeros(self.N) -class Full(EP): - def fit_EP(self): +class Full(EP_base): + """ + :param likelihood: Output's likelihood (e.g. probit) + :type likelihood: GPy.inference.likelihood instance + :param K: prior covariance matrix + :type K: np.ndarray (N x N) + :param likelihood: Output's likelihood (e.g. 
probit) + :type likelihood: GPy.inference.likelihood instance + :param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float) + :param powerep: Power-EP parameters (eta,delta) - 2x1 numpy array (floats) + """ + def __init__(self,K,likelihood,*args,**kwargs): + assert K.shape[0] == K.shape[1] + self.K = K + self.N = self.K.shape[0] + EP_base.__init__(self,likelihood,*args,**kwargs) + def fit_EP(self,messages=False): """ The expectation-propagation algorithm. For nomenclature see Rasmussen & Williams 2006 (pag. 52-60) @@ -78,15 +69,16 @@ class Full(EP): sigma_ = 1./tau_ mu_ = v_/tau_ """ - self.tau_ = np.empty(self.N,dtype=float) - self.v_ = np.empty(self.N,dtype=float) + + self.tau_ = np.empty(self.N,dtype=np.float64) + self.v_ = np.empty(self.N,dtype=np.float64) #Initial values - Marginal moments - z = np.empty(self.N,dtype=float) - self.Z_hat = np.empty(self.N,dtype=float) - phi = np.empty(self.N,dtype=float) - mu_hat = np.empty(self.N,dtype=float) - sigma2_hat = np.empty(self.N,dtype=float) + z = np.empty(self.N,dtype=np.float64) + self.Z_hat = np.empty(self.N,dtype=np.float64) + phi = np.empty(self.N,dtype=np.float64) + mu_hat = np.empty(self.N,dtype=np.float64) + sigma2_hat = np.empty(self.N,dtype=np.float64) #Approximation epsilon_np1 = self.epsilon + 1. 
+ epsilon_np1 = np.mean((self.tau_tilde-self.np1[-1])**2) + epsilon_np2 = np.mean((self.v_tilde-self.np2[-1])**2) self.np1.append(self.tau_tilde.copy()) self.np2.append(self.v_tilde.copy()) + if messages: + print "EP iteration %i, epsilon %e"%(self.iterations,epsilon_np1)
+ assert self.M <= self.N, 'The number of inducing inputs must not exceed the number of observations'
+ #this is a bit nasty: we want to distinguish between parts with the same name by appending a count
import kern -from ..inference.Expectation_Propagation import EP,Full +from ..inference.Expectation_Propagation import Full from ..inference.likelihoods import likelihood,probit#,poisson,gaussian from ..core import model from ..util.linalg import pdinv,jitchol diff --git a/GPy/models/GP_regression.py b/GPy/models/GP_regression.py index 885f13cc..f21c4591 100644 --- a/GPy/models/GP_regression.py +++ b/GPy/models/GP_regression.py @@ -71,7 +71,7 @@ class GP_regression(model): def set_param(self,p): self.kern.expand_param(p) self.K = self.kern.K(self.X,slices1=self.Xslices) - self.Ki,self.hld = pdinv(self.K) + self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K) def get_param(self): return self.kern.extract_param() @@ -90,7 +90,7 @@ class GP_regression(model): return -0.5*np.sum(np.multiply(self.Ki, self.Youter)) def log_likelihood(self): - complexity_term = -0.5*self.N*self.D*np.log(2.*np.pi) - self.D*self.hld + complexity_term = -0.5*self.N*self.D*np.log(2.*np.pi) - 0.5*self.D*self.K_logdet return complexity_term + self._model_fit_term() def dL_dK(self): diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py index 00ead0ba..dd721559 100644 --- a/GPy/models/__init__.py +++ b/GPy/models/__init__.py @@ -9,4 +9,3 @@ from warped_GP import warpedGP from GP_EP import GP_EP from generalized_FITC import generalized_FITC from sparse_GPLVM import sparse_GPLVM - diff --git a/GPy/models/generalized_FITC.py b/GPy/models/generalized_FITC.py index b4953ecb..4ef36f0e 100644 --- a/GPy/models/generalized_FITC.py +++ b/GPy/models/generalized_FITC.py @@ -9,7 +9,7 @@ from .. 
import kern from ..core import model from ..util.linalg import pdinv,mdot from ..util.plot import gpplot -from ..inference.Expectation_Propagation import EP,Full,FITC +from ..inference.Expectation_Propagation import FITC from ..inference.likelihoods import likelihood,probit class generalized_FITC(model): @@ -62,7 +62,7 @@ class generalized_FITC(model): def posterior_param(self): self.Knn_diag = self.kernel.Kdiag(self.X) self.Kmm = self.kernel.K(self.Z) - self.Kmmi, self.Kmm_hld = pdinv(self.Kmm) + self.Kmmi, self.Lmm, self.Lmmi, self.Kmm_logdet = pdinv(self.Kmm) self.Knm = self.kernel.K(self.X,self.Z) self.KmmiKmn = np.dot(self.Kmmi,self.Knm.T) self.Qnn = np.dot(self.Knm,self.KmmiKmn) @@ -72,10 +72,10 @@ class generalized_FITC(model): self.Taut = self.ep_approx.tau_tilde/(1.+ self.ep_approx.tau_tilde*self.Diag0) self.KmnTaut = self.Knm.T*self.Taut[None,:] self.KmnTautKnm = np.dot(self.KmnTaut, self.Knm) - self.Woodbury_inv, self.Woodbury_hld = pdinv(self.Kmm + self.KmnTautKnm) + self.Woodbury_inv, self.Wood_L, self.Wood_Li, self.Woodbury_logdet = pdinv(self.Kmm + self.KmnTautKnm) self.Qnn_diag = self.Knn_diag - np.diag(self.Qnn) + 1./self.ep_approx.tau_tilde self.Qi = -np.dot(self.KmnTaut.T, np.dot(self.Woodbury_inv,self.KmnTaut)) + np.diag(self.Taut) - self.hld = 0.5*np.sum(np.log(self.Diag0 + 1./self.ep_approx.tau_tilde)) - self.Kmm_hld + self.Woodbury_hld + self.hld = 0.5*np.sum(np.log(self.Diag0 + 1./self.ep_approx.tau_tilde)) - 0.5*self.Kmm_logdet + 0.5*self.Woodbury_logdet self.Diag = self.Diag0/(1.+ self.Diag0 * self.ep_approx.tau_tilde) self.P = (self.Diag / self.Diag0)[:,None] * self.Knm diff --git a/GPy/models/sparse_GP_regression.py b/GPy/models/sparse_GP_regression.py index 38aeef08..da19d80a 100644 --- a/GPy/models/sparse_GP_regression.py +++ b/GPy/models/sparse_GP_regression.py @@ -87,14 +87,10 @@ class sparse_GP_regression(GP_regression): self.V = self.beta*self.Y self.psi1V = np.dot(self.psi1, self.V) self.psi1VVpsi1 = np.dot(self.psi1V, 
+Fails in weird ways if you pass an integer as the input instead of a double to the kernel.
+Kernel implementations are far too obscure. Need to be easily readable for a first time user.
+ +A flag on covariance functions that indicates when they are not associated with an underlying function (like white noise or a coregionalization matrix). + +Diagonal noise covariance function + +Long term: automatic Lagrange multiplier calculation for optimizers: constrain two parameters in an unusual way and the model automatically does the Lagrangian. Also augment the parameters with new ones, so define data variance to be white noise plus RBF variance and optimize over that and signal to noise ratio ... for example constrain the sum of variances to equal the known variance of the data. + +When computing kernel.K for kernels like rbf, you can't compute a version with rbf.K(X) you have to do rbf.K(X, X) + +the predict method for GP_regression returns a covariance matrix which is a bad idea as this takes a lot to compute, it's also confusing for first time users. Should only be returned if the user explicitly requests it. diff --git a/GPy/testing/unit_tests.py b/GPy/testing/unit_tests.py index 9191cedd..02a63feb 100644 --- a/GPy/testing/unit_tests.py +++ b/GPy/testing/unit_tests.py @@ -139,13 +139,13 @@ class GradientTests(unittest.TestCase): m.constrain_positive('(linear|bias|white)') self.assertTrue(m.checkgrad()) - def test_simple_GP_EP(self): + def test_GP_EP(self): N = 20 X = np.hstack([np.random.rand(N/2)+1,np.random.rand(N/2)-1])[:,None] k = GPy.kern.rbf(1) + GPy.kern.white(1) Y = np.hstack([np.ones(N/2),-np.ones(N/2)])[:,None] likelihood = GPy.inference.likelihoods.probit(Y) - m = GPy.models.simple_GP_EP(X,likelihood,k) + m = GPy.models.GP_EP(X,likelihood,k) m.constrain_positive('(var|len)') m.approximate_likelihood() self.assertTrue(m.checkgrad()) diff --git a/GPy/util/linalg.py b/GPy/util/linalg.py index 8afffce3..092589a9 100644 --- a/GPy/util/linalg.py +++ b/GPy/util/linalg.py @@ -46,19 +46,14 @@ def _mdot_r(a,b): def jitchol(A,maxtries=5): """ - Arguments - --------- - A : An almost pd square matrix + :param A : An almost pd square matrix - Returns - 
------- - cholesky(K) + :rval L: the Cholesky decomposition of A - Notes - ----- - Adds jitter to K, to enforce positive-definiteness - if stuff breaks, please check: - np.allclose(sp.linalg.cholesky(XXT, lower = True), np.triu(sp.linalg.cho_factor(XXT)[0]).T) + .. Note: + Adds jitter to K, to enforce positive-definiteness + if stuff breaks, please check: + np.allclose(sp.linalg.cholesky(XXT, lower = True), np.triu(sp.linalg.cho_factor(XXT)[0]).T) """ try: return linalg.cholesky(A, lower = True) @@ -78,23 +73,23 @@ def jitchol(A,maxtries=5): def pdinv(A): """ - Arguments - --------- :param A: A DxD pd numpy array - Returns - ------- - inv : the inverse of A - hld: 0.5* the log of the determinant of A + :rval Ai: the inverse of A + :rtype Ai: np.ndarray + :rval L: the Cholesky decomposition of A + :rtype L: np.ndarray + :rval Li: the Cholesky decomposition of Ai + :rtype Li: np.ndarray + :rval logdet: the log of the determinant of A + :rtype logdet: float64 """ L = jitchol(A) - hld = np.sum(np.log(np.diag(L))) + logdet = 2.*np.sum(np.log(np.diag(L))) + Li = chol_inv(L) + Ai = np.dot(Li.T,Li) #TODO: get the flapack routine form multiplying triangular matrices - inv = sp.lib.lapack.flapack.dpotri(L)[0] - # inv = linalg.flapack.dpotri(L,lower = 1)[0] - inv = np.tril(inv)+np.tril(inv,-1).T - - return inv, hld + return Ai, L, Li, logdet def chol_inv(L): @@ -139,7 +134,7 @@ def PCA(Y, Q): Returns ------- - X - NxQ np.array of dimensionality reduced data + :rval X: - NxQ np.array of dimensionality reduced data W - QxD mapping from X to Y """ if not np.allclose(Y.mean(axis=0), 0.0): diff --git a/setup.py b/setup.py index ed841b56..076af82c 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,7 @@ setup(name = 'GPy', package_dir={'GPy': 'GPy'}, package_data = {'GPy': ['GPy/examples']}, py_modules = ['GPy.__init__'], - long_description=read('README'), + long_description=read('README.md'), ext_modules = [Extension(name = 'GPy.kern.lfmUpsilonf2py', sources = 
['GPy/kern/src/lfmUpsilonf2py.f90'])], install_requires=['numpy>=1.6', 'scipy>=0.9','matplotlib>=1.1'],