mirror of https://github.com/SheffieldML/GPy.git (synced 2026-05-08 03:22:38 +02:00)

Merge branch 'master' of github.com:SheffieldML/GPy

Commit da51f69ec3: 15 changed files with 167 additions and 101 deletions
@@ -6,5 +6,5 @@ import kern
import models
import inference
import util
import examples
#import examples TODO: discuss!
from core import priors
@@ -162,6 +162,20 @@ class model(parameterised):
        else:
            self.expand_param(initial_parameters)

    def ensure_default_constraints(self,warn=False):
        """
        Ensure that any variables which should clearly be positive have been constrained somehow.
        """
        positive_strings = ['variance','lengthscale', 'precision']
        for s in positive_strings:
            for i in self.grep_param_names(s):
                if not (i in self.all_constrained_indices()):
                    name = self.get_param_names()[i]
                    self.constrain_positive(name)
                    if warn:
                        print "Warning! constraining %s positive"%name

    def optimize(self, optimizer=None, start=None, **kwargs):
        """
        Optimize the model using self.log_likelihood and self.log_likelihood_gradient, as well as self.priors.
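The new ensure_default_constraints is a small pattern worth noting: regex-match parameter names, then constrain any match that is not already constrained. A minimal standalone sketch of that pattern (ToyModel and its attributes are hypothetical stand-ins; GPy's real parameterised model supplies grep_param_names, all_constrained_indices and constrain_positive):

import re

class ToyModel(object):
    """Hypothetical stand-in for GPy's parameterised model (illustration only)."""
    def __init__(self):
        self.param_names = ['rbf_variance', 'rbf_lengthscale', 'noise_precision', 'bias']
        self.constrained = set() # indices that are already constrained somehow

    def grep_param_names(self, pattern):
        # indices of parameters whose names match the regex
        return [i for i, n in enumerate(self.param_names) if re.search(pattern, n)]

    def constrain_positive(self, name):
        self.constrained.update(self.grep_param_names(name))

    def ensure_default_constraints(self, warn=False):
        for s in ['variance', 'lengthscale', 'precision']:
            for i in self.grep_param_names(s):
                if i not in self.constrained:
                    name = self.param_names[i]
                    self.constrain_positive(name)
                    if warn:
                        print "Warning! constraining %s positive" % name

m = ToyModel()
m.ensure_default_constraints(warn=True) # constrains indices 0, 1, 2; 'bias' stays free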
@@ -11,45 +11,21 @@ from ..util.linalg import pdinv,mdot,jitchol
from ..util.plot import gpplot
from .. import kern

class EP:
    def __init__(self,covariance,likelihood,Kmn=None,Knn_diag=None,epsilon=1e-3,powerep=[1.,1.]):
        """
        Expectation Propagation
class EP_base:
    """
    Expectation Propagation.

    Arguments
    ---------
    X : input observations
    likelihood : Output's likelihood (likelihood class)
    kernel : a GPy kernel (kern class)
    inducing : either an array specifying the locations of the inducing points or a scalar defining their number; use None for a non-sparse model
    powerep : Power-EP parameters (eta,delta) - 2x1 numpy array (floats)
    epsilon : Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
    """
    This is just the base class for expectation propagation. We'll extend it for full and sparse EP.
    """
    def __init__(self,likelihood,epsilon=1e-3,powerep=[1.,1.]):
        self.likelihood = likelihood
        assert covariance.shape[0] == covariance.shape[1]
        if Kmn is not None:
            self.Kmm = covariance
            self.Kmn = Kmn
            self.M = self.Kmn.shape[0]
            self.N = self.Kmn.shape[1]
            assert self.M < self.N, 'The number of inducing inputs must be smaller than the number of observations'
        else:
            self.K = covariance
            self.N = self.K.shape[0]
        if Knn_diag is not None:
            self.Knn_diag = Knn_diag
            assert len(Knn_diag) == self.N, 'Knn_diag has a size different from N'

        self.epsilon = epsilon
        self.eta, self.delta = powerep
        self.jitter = 1e-12

        """
        Initial values - Likelihood approximation parameters:
        p(y|f) = t(f|tau_tilde,v_tilde)
        """
        self.tau_tilde = np.zeros(self.N)
        self.v_tilde = np.zeros(self.N)
        #Initial values - Likelihood approximation parameters:
        #p(y|f) = t(f|tau_tilde,v_tilde)
        self.restart_EP()

    def restart_EP(self):
        """
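For reference, the approximation these tau_tilde/v_tilde parameters encode (Rasmussen and Williams, 2006, section 3.6): each likelihood term is replaced by an unnormalized Gaussian site, stored through its natural parameters,

    p(y_i \mid f_i) \approx \tilde{t}_i(f_i) = \tilde{Z}_i \, \mathcal{N}(f_i \mid \tilde{\mu}_i, \tilde{\sigma}_i^2),
    \qquad \tilde{\tau}_i = 1/\tilde{\sigma}_i^2, \quad \tilde{\nu}_i = \tilde{\tau}_i \tilde{\mu}_i,

so restart_EP() resetting tau_tilde and v_tilde to zero corresponds to flat (uninformative) sites.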
@@ -59,8 +35,23 @@ class EP:
        self.v_tilde = np.zeros(self.N)
        self.mu = np.zeros(self.N)

class Full(EP):
    def fit_EP(self):
class Full(EP_base):
    """
    :param likelihood: Output's likelihood (e.g. probit)
    :type likelihood: GPy.inference.likelihood instance
    :param K: prior covariance matrix
    :type K: np.ndarray (N x N)
    :param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
    :param powerep: Power-EP parameters (eta,delta) - 2x1 numpy array (floats)
    """
    def __init__(self,K,likelihood,*args,**kwargs):
        assert K.shape[0] == K.shape[1]
        self.K = K
        self.N = self.K.shape[0]
        EP_base.__init__(self,likelihood,*args,**kwargs)
    def fit_EP(self,messages=False):
        """
        The expectation-propagation algorithm.
        For nomenclature see Rasmussen & Williams 2006 (pp. 52-60)
@@ -78,15 +69,16 @@ class Full(EP):
        sigma_ = 1./tau_
        mu_ = v_/tau_
        """
        self.tau_ = np.empty(self.N,dtype=float)
        self.v_ = np.empty(self.N,dtype=float)
        self.tau_ = np.empty(self.N,dtype=np.float64)
        self.v_ = np.empty(self.N,dtype=np.float64)

        #Initial values - Marginal moments
        z = np.empty(self.N,dtype=float)
        self.Z_hat = np.empty(self.N,dtype=float)
        phi = np.empty(self.N,dtype=float)
        mu_hat = np.empty(self.N,dtype=float)
        sigma2_hat = np.empty(self.N,dtype=float)
        z = np.empty(self.N,dtype=np.float64)
        self.Z_hat = np.empty(self.N,dtype=np.float64)
        phi = np.empty(self.N,dtype=np.float64)
        mu_hat = np.empty(self.N,dtype=np.float64)
        sigma2_hat = np.empty(self.N,dtype=np.float64)

        #Approximation
        epsilon_np1 = self.epsilon + 1.
@@ -95,8 +87,7 @@ class Full(EP):
        self.np1 = [self.tau_tilde.copy()]
        self.np2 = [self.v_tilde.copy()]
        while epsilon_np1 > self.epsilon or epsilon_np2 > self.epsilon:
            update_order = np.arange(self.N)
            random.shuffle(update_order)
            update_order = np.random.permutation(self.N)
            for i in update_order:
                #Cavity distribution parameters
                self.tau_[i] = 1./self.Sigma[i,i] - self.eta*self.tau_tilde[i]
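The last line above is the standard power-EP cavity computation: with the current posterior marginal q(f_i) = N(mu_i, Sigma_ii) and power parameter eta, removing an eta-fraction of site i gives

    \tau_{-i} = \Sigma_{ii}^{-1} - \eta\,\tilde{\tau}_i,
    \qquad \nu_{-i} = \Sigma_{ii}^{-1}\mu_i - \eta\,\tilde{\nu}_i,

after which the cavity is combined with the exact likelihood term and the site is updated by moment matching.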
@@ -120,14 +111,35 @@ class Full(EP):
            V,info = linalg.flapack.dtrtrs(L,Sroot_tilde_K,lower=1)
            self.Sigma = self.K - np.dot(V.T,V)
            self.mu = np.dot(self.Sigma,self.v_tilde)
            epsilon_np1 = sum((self.tau_tilde-self.np1[-1])**2)/self.N
            epsilon_np2 = sum((self.v_tilde-self.np2[-1])**2)/self.N
            epsilon_np1 = np.mean((self.tau_tilde-self.np1[-1])**2)
            epsilon_np2 = np.mean((self.v_tilde-self.np2[-1])**2)
            self.np1.append(self.tau_tilde.copy())
            self.np2.append(self.v_tilde.copy())
            if messages:
                print "EP iteration %i, epsilon %e"%(self.iterations,epsilon_np1)

            self.np2.append(self.v_tilde.copy())

class FITC(EP_base):
    """
    :param likelihood: Output's likelihood (e.g. probit)
    :type likelihood: GPy.inference.likelihood instance
    :param Knn_diag: the diagonal elements of Knn (a 1D vector)
    :param Kmn: the 'cross' covariance between the inducing inputs and the data
    :param Kmm: the covariance matrix of the inducing inputs
    :param epsilon: Convergence criterion, maximum squared difference allowed between mean updates to stop iterations (float)
    :param powerep: Power-EP parameters (eta,delta) - 2x1 numpy array (floats)
    """
    def __init__(self,likelihood,Knn_diag,Kmn,Kmm,*args,**kwargs):
        self.Knn_diag = Knn_diag
        self.Kmn = Kmn
        self.Kmm = Kmm
        self.M = self.Kmn.shape[0]
        self.N = self.Kmn.shape[1]
        assert self.M <= self.N, 'The number of inducing inputs must not exceed the number of observations'
        assert len(Knn_diag) == self.N, 'Knn_diag has a size different from N'
        EP_base.__init__(self,likelihood,*args,**kwargs)

class FITC(EP):
    def fit_EP(self):
        """
        The expectation-propagation algorithm with sparse pseudo-inputs.
@@ -166,15 +178,15 @@ class FITC(EP):
        sigma_ = 1./tau_
        mu_ = v_/tau_
        """
        self.tau_ = np.empty(self.N,dtype=float)
        self.v_ = np.empty(self.N,dtype=float)
        self.tau_ = np.empty(self.N,dtype=np.float64)
        self.v_ = np.empty(self.N,dtype=np.float64)

        #Initial values - Marginal moments
        z = np.empty(self.N,dtype=float)
        self.Z_hat = np.empty(self.N,dtype=float)
        phi = np.empty(self.N,dtype=float)
        mu_hat = np.empty(self.N,dtype=float)
        sigma2_hat = np.empty(self.N,dtype=float)
        z = np.empty(self.N,dtype=np.float64)
        self.Z_hat = np.empty(self.N,dtype=np.float64)
        phi = np.empty(self.N,dtype=np.float64)
        mu_hat = np.empty(self.N,dtype=np.float64)
        sigma2_hat = np.empty(self.N,dtype=np.float64)

        #Approximation
        epsilon_np1 = 1
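A one-line reminder of the approximation behind the FITC class (Snelson and Ghahramani, 2006): with M inducing inputs, the N x N prior covariance is replaced by

    K_{nn} \approx Q_{nn} = K_{nm}K_{mm}^{-1}K_{mn} + \operatorname{diag}\left(K_{nn} - K_{nm}K_{mm}^{-1}K_{mn}\right),

which is why the constructor only needs Knn_diag, Kmn and Kmm rather than the full N x N matrix.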
@@ -19,7 +19,7 @@ class finite_dimensional(kernpart):
        self.D = D
        self.F = F
        self.G = G
        self.G_1, tmp = pdinv(G)
        self.G_1, L, Li, logdet = pdinv(G)
        self.n = F.shape[0]
        if weights is not None:
            assert weights.shape==(self.n,)
@@ -99,6 +99,7 @@ class kern(parameterised):
        newkern.constrained_fixed_values = self.constrained_fixed_values + other.constrained_fixed_values
        newkern.tied_indices = self.tied_indices + [self.Nparam + x for x in other.tied_indices]
        return newkern

    def add(self,other):
        """
        Add another kernel to this one. Both kernels are defined on the same _space_
@@ -139,7 +140,13 @@ class kern(parameterised):
        [p.set_param(x[s]) for p, s in zip(self.parts, self.param_slices)]

    def get_param_names(self):
        return sum([[k.name+'_'+str(i)+'_'+n for n in k.get_param_names()] for i,k in enumerate(self.parts)],[])
        #this is a bit nasty: we want to distinguish between parts with the same name by appending a count
        part_names = np.array([k.name for k in self.parts],dtype=np.str)
        counts = [np.sum(part_names==ni) for i, ni in enumerate(part_names)]
        cum_counts = [np.sum(part_names[i:]==ni) for i, ni in enumerate(part_names)]
        names = [name+'_'+str(cum_count) if count>1 else name for name,count,cum_count in zip(part_names,counts,cum_counts)]

        return sum([[name+'_'+n for n in k.get_param_names()] for name,k in zip(names,self.parts)],[])

    def K(self,X,X2=None,slices1=None,slices2=None):
        assert X.shape[1]==self.D
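The disambiguation logic above is easy to misread, so here is a standalone sketch of what it produces (the disambiguate helper is hypothetical, extracted for illustration). Note that the suffix counts down over the remaining duplicates, so the first of two rbf parts becomes rbf_2 and the second rbf_1:

import numpy as np

def disambiguate(part_names):
    # mimic kern.get_param_names: append a (reversed) count when a part name repeats
    part_names = np.array(part_names)
    counts = [np.sum(part_names == ni) for ni in part_names]
    cum_counts = [np.sum(part_names[i:] == ni) for i, ni in enumerate(part_names)]
    return [n + '_' + str(c) if count > 1 else n
            for n, count, c in zip(part_names, counts, cum_counts)]

print disambiguate(['rbf', 'rbf', 'white']) # -> ['rbf_2', 'rbf_1', 'white']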
@@ -6,7 +6,7 @@ import numpy as np
import pylab as pb
from scipy import stats, linalg
from .. import kern
from ..inference.Expectation_Propagation import EP,Full
from ..inference.Expectation_Propagation import Full
from ..inference.likelihoods import likelihood,probit#,poisson,gaussian
from ..core import model
from ..util.linalg import pdinv,jitchol
@@ -71,7 +71,7 @@ class GP_regression(model):
    def set_param(self,p):
        self.kern.expand_param(p)
        self.K = self.kern.K(self.X,slices1=self.Xslices)
        self.Ki,self.hld = pdinv(self.K)
        self.Ki, self.L, self.Li, self.K_logdet = pdinv(self.K)

    def get_param(self):
        return self.kern.extract_param()
@@ -90,7 +90,7 @@ class GP_regression(model):
        return -0.5*np.sum(np.multiply(self.Ki, self.Youter))

    def log_likelihood(self):
        complexity_term = -0.5*self.N*self.D*np.log(2.*np.pi) - self.D*self.hld
        complexity_term = -0.5*self.N*self.D*np.log(2.*np.pi) - 0.5*self.D*self.K_logdet
        return complexity_term + self._model_fit_term()

    def dL_dK(self):
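With K_logdet = log|K| from the new pdinv, the two pieces assemble the usual multi-output GP log marginal likelihood,

    \log p(Y \mid X) = -\tfrac{ND}{2}\log 2\pi - \tfrac{D}{2}\log|K| - \tfrac{1}{2}\operatorname{tr}\left(K^{-1}YY^\top\right),

where the model-fit term above supplies -0.5 tr(K^{-1} Y Y^T) (Youter = Y Y^T). The old hld was half the log determinant, so the explicit 0.5 factor keeps the value unchanged.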
@@ -9,4 +9,3 @@ from warped_GP import warpedGP
from GP_EP import GP_EP
from generalized_FITC import generalized_FITC
from sparse_GPLVM import sparse_GPLVM
@@ -9,7 +9,7 @@ from .. import kern
from ..core import model
from ..util.linalg import pdinv,mdot
from ..util.plot import gpplot
from ..inference.Expectation_Propagation import EP,Full,FITC
from ..inference.Expectation_Propagation import FITC
from ..inference.likelihoods import likelihood,probit

class generalized_FITC(model):
@@ -62,7 +62,7 @@ class generalized_FITC(model):
    def posterior_param(self):
        self.Knn_diag = self.kernel.Kdiag(self.X)
        self.Kmm = self.kernel.K(self.Z)
        self.Kmmi, self.Kmm_hld = pdinv(self.Kmm)
        self.Kmmi, self.Lmm, self.Lmmi, self.Kmm_logdet = pdinv(self.Kmm)
        self.Knm = self.kernel.K(self.X,self.Z)
        self.KmmiKmn = np.dot(self.Kmmi,self.Knm.T)
        self.Qnn = np.dot(self.Knm,self.KmmiKmn)
@@ -72,10 +72,10 @@ class generalized_FITC(model):
        self.Taut = self.ep_approx.tau_tilde/(1.+ self.ep_approx.tau_tilde*self.Diag0)
        self.KmnTaut = self.Knm.T*self.Taut[None,:]
        self.KmnTautKnm = np.dot(self.KmnTaut, self.Knm)
        self.Woodbury_inv, self.Woodbury_hld = pdinv(self.Kmm + self.KmnTautKnm)
        self.Woodbury_inv, self.Wood_L, self.Wood_Li, self.Woodbury_logdet = pdinv(self.Kmm + self.KmnTautKnm)
        self.Qnn_diag = self.Knn_diag - np.diag(self.Qnn) + 1./self.ep_approx.tau_tilde
        self.Qi = -np.dot(self.KmnTaut.T, np.dot(self.Woodbury_inv,self.KmnTaut)) + np.diag(self.Taut)
        self.hld = 0.5*np.sum(np.log(self.Diag0 + 1./self.ep_approx.tau_tilde)) - self.Kmm_hld + self.Woodbury_hld
        self.hld = 0.5*np.sum(np.log(self.Diag0 + 1./self.ep_approx.tau_tilde)) - 0.5*self.Kmm_logdet + 0.5*self.Woodbury_logdet

        self.Diag = self.Diag0/(1.+ self.Diag0 * self.ep_approx.tau_tilde)
        self.P = (self.Diag / self.Diag0)[:,None] * self.Knm
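The hld line is the matrix determinant lemma in action: writing Lambda = Diag0 + 1/tau_tilde (so that Taut = Lambda^{-1} elementwise),

    \tfrac{1}{2}\log\left|K_{nm}K_{mm}^{-1}K_{mn} + \Lambda\right|
    = \tfrac{1}{2}\log|\Lambda| - \tfrac{1}{2}\log|K_{mm}| + \tfrac{1}{2}\log\left|K_{mm} + K_{mn}\Lambda^{-1}K_{nm}\right|,

which matches the three terms in the code, with the last determinant supplied by Woodbury_logdet.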
@@ -87,14 +87,10 @@ class sparse_GP_regression(GP_regression):
        self.V = self.beta*self.Y
        self.psi1V = np.dot(self.psi1, self.V)
        self.psi1VVpsi1 = np.dot(self.psi1V, self.psi1V.T)
        self.Lm = jitchol(self.Kmm)
        self.Lmi = chol_inv(self.Lm)
        self.Kmmi = np.dot(self.Lmi.T, self.Lmi)
        self.Kmmi, self.Lm, self.Lmi, self.Kmm_logdet = pdinv(self.Kmm)
        self.A = mdot(self.Lmi, self.psi2, self.Lmi.T)
        self.B = np.eye(self.M) + self.beta * self.A
        self.LB = jitchol(self.B)
        self.LBi = chol_inv(self.LB)
        self.Bi = np.dot(self.LBi.T, self.LBi)
        self.Bi, self.LB, self.LBi, self.B_logdet = pdinv(self.B)
        self.LLambdai = np.dot(self.LBi, self.Lmi)
        self.trace_K = self.psi0 - np.trace(self.A)
        self.LBL_inv = mdot(self.Lmi.T, self.Bi, self.Lmi)
@@ -124,7 +120,7 @@ class sparse_GP_regression(GP_regression):
        """
        A = -0.5*self.N*self.D*(np.log(2.*np.pi) - np.log(self.beta))
        B = -0.5*self.beta*self.D*self.trace_K
        C = -self.D * np.sum(np.log(np.diag(self.LB)))
        C = -0.5*self.D * self.B_logdet
        D = -0.5*self.beta*self.trYYT
        E = +0.5*np.sum(self.psi1VVpsi1 * self.LBL_inv)
        return A+B+C+D+E
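The rewritten C term is algebraically identical to the old one, since pdinv returns B_logdet = 2 * sum(log(diag(LB))). A quick standalone numpy check (the matrix values here are arbitrary):

import numpy as np

M, D = 5, 3
B = np.random.randn(M, M)
B = np.dot(B, B.T) + M*np.eye(M) # a positive definite B
LB = np.linalg.cholesky(B)
B_logdet = 2.*np.sum(np.log(np.diag(LB)))
assert np.allclose(-0.5*D*B_logdet, -D*np.sum(np.log(np.diag(LB))))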
@@ -39,7 +39,8 @@ class uncollapsed_sparse_GP(sparse_GP_regression):
            M = Z.shape[0]
        else:
            M=M
        self.set_vb_param(np.hstack((np.ones(M*D)),np.eye(M).flatten()))
        q_u = np.hstack((np.ones(M*D), np.eye(M).flatten()))
        self.set_vb_param(q_u)
        sparse_GP_regression.__init__(self, X, Y, *args, **kwargs)

    def _computations(self):
GPy/notes.txt (new file, 42 lines)

@@ -0,0 +1,42 @@
Fails in weird ways if you pass an integer as the input instead of a double to the kernel.

The Matern kernels (at least the 52) still work in the ARD manner, which means they won't run for very large input dimensions. They need to be fixed to match the RBF.

Implementing new covariances is too complicated at the moment. We need a barebones example of what to implement and where. Commenting in the covariance matrices needs to be improved. It's not clear to a user what all the psi parts are for. Maybe we need a cut-down and simplified example to help with this (perhaps a cut-down version of the RBF?). And then we should provide a simple list of what you need to do to get a new kernel going.

Missing kernels: polynomial, rational quadratic.

Kernel implementations are far too obscure. They need to be easily readable for a first-time user.

Need an implementation of scaled conjugate gradients for the optimizers.

Need an implementation of gradient descent for the optimizers (works well with GP-LVM for small random initializations).

Need Carl Rasmussen's permission to add his conjugate gradients algorithm. In fact, we can just provide a hook for it, and post a separate python implementation of his algorithm.

Change get_param and set_param to get_params and set_params.

Constrain parameters by default inside model creation.

Randomize doesn't seem to cover a wide enough range for restarts ... try it for a model where inputs are widely spaced apart and the length scale is too short. Sampling from N(0,1) is too conservative. Dangerous for people who naively use restarts. Since we have the model we could maybe come up with some sensible heuristics for setting these things. Maybe we should also consider having '.initialize()'. If we can't do this well we should disable the restart method.

Tolerances for optimizers: do we need to introduce some standardization? At the moment does each have its own defaults?

Do all optimizers work only in terms of function evaluations? Do we need to check for one that uses iterations?

Change Youter to YYT (Youter doesn't mean anything for matrices).

Bug when running classification.crescent_data().

A dictionary for parameter storage? So we can go through names easily?

A flag on covariance functions that indicates when they are not associated with an underlying function (like white noise or a coregionalization matrix).

Diagonal noise covariance function.

Long term: automatic Lagrange multiplier calculation for optimizers: constrain two parameters in an unusual way and the model automatically does the Lagrangian. Also augment the parameters with new ones, so define data variance to be white noise plus RBF variance and optimize over that and the signal-to-noise ratio ... for example, constrain the sum of variances to equal the known variance of the data.

When computing kernel.K for kernels like rbf, you can't compute a version with rbf.K(X); you have to do rbf.K(X, X).

The predict method for GP_regression returns a covariance matrix, which is a bad idea as this takes a lot to compute; it's also confusing for first-time users. It should only be returned if the user explicitly requests it.
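On the rbf.K(X) complaint above: the compound kern.K in this commit already takes X2=None, so the conventional fix is to default X2 to X inside the part-level K as well. A minimal sketch with a hypothetical standalone RBF (not GPy's implementation):

import numpy as np

def rbf_K(X, X2=None, variance=1., lengthscale=1.):
    if X2 is None: # so rbf_K(X) means rbf_K(X, X)
        X2 = X
    r2 = ((X[:, None, :] - X2[None, :, :])**2).sum(-1) # squared distances
    return variance * np.exp(-0.5 * r2 / lengthscale**2)

X = np.random.randn(4, 1)
assert np.allclose(rbf_K(X), rbf_K(X, X))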
@@ -139,13 +139,13 @@ class GradientTests(unittest.TestCase):
        m.constrain_positive('(linear|bias|white)')
        self.assertTrue(m.checkgrad())

    def test_simple_GP_EP(self):
    def test_GP_EP(self):
        N = 20
        X = np.hstack([np.random.rand(N/2)+1,np.random.rand(N/2)-1])[:,None]
        k = GPy.kern.rbf(1) + GPy.kern.white(1)
        Y = np.hstack([np.ones(N/2),-np.ones(N/2)])[:,None]
        likelihood = GPy.inference.likelihoods.probit(Y)
        m = GPy.models.simple_GP_EP(X,likelihood,k)
        m = GPy.models.GP_EP(X,likelihood,k)
        m.constrain_positive('(var|len)')
        m.approximate_likelihood()
        self.assertTrue(m.checkgrad())
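For readers unfamiliar with m.checkgrad(): it compares the model's analytic gradient against finite differences. A standalone sketch of the idea (the f/df pair here is arbitrary, not GPy's API):

import numpy as np

def checkgrad(f, df, x, eps=1e-6, tol=1e-4):
    # central finite differences vs. the analytic gradient
    g_analytic = df(x)
    g_numeric = np.empty_like(x)
    for i in range(x.size):
        e = np.zeros_like(x)
        e[i] = eps
        g_numeric[i] = (f(x + e) - f(x - e)) / (2.*eps)
    return np.allclose(g_analytic, g_numeric, atol=tol)

f = lambda x: np.sum(np.sin(x))
df = lambda x: np.cos(x)
print checkgrad(f, df, np.random.randn(4)) # -> True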
@@ -46,19 +46,14 @@ def _mdot_r(a,b):

def jitchol(A,maxtries=5):
    """
    Arguments
    ---------
    A : An almost pd square matrix
    :param A: an almost pd square matrix

    Returns
    -------
    cholesky(K)
    :rval L: the Cholesky decomposition of A

    Notes
    -----
    Adds jitter to K, to enforce positive-definiteness
    if stuff breaks, please check:
    np.allclose(sp.linalg.cholesky(XXT, lower = True), np.triu(sp.linalg.cho_factor(XXT)[0]).T)
    .. Note:
        Adds jitter to A, to enforce positive-definiteness.
        If stuff breaks, please check:
        np.allclose(sp.linalg.cholesky(XXT, lower = True), np.triu(sp.linalg.cho_factor(XXT)[0]).T)
    """
    try:
        return linalg.cholesky(A, lower = True)
@@ -78,23 +73,23 @@ def jitchol(A,maxtries=5):

def pdinv(A):
    """
    Arguments
    ---------
    :param A: a DxD pd numpy array

    Returns
    -------
    inv : the inverse of A
    hld : 0.5 * the log of the determinant of A
    :rval Ai: the inverse of A
    :rtype Ai: np.ndarray
    :rval L: the Cholesky decomposition of A
    :rtype L: np.ndarray
    :rval Li: the inverse of the Cholesky decomposition of A
    :rtype Li: np.ndarray
    :rval logdet: the log of the determinant of A
    :rtype logdet: float64
    """
    L = jitchol(A)
    hld = np.sum(np.log(np.diag(L)))
    logdet = 2.*np.sum(np.log(np.diag(L)))
    Li = chol_inv(L)
    Ai = np.dot(Li.T,Li) #TODO: get the flapack routine for multiplying triangular matrices

    inv = sp.lib.lapack.flapack.dpotri(L)[0]
    # inv = linalg.flapack.dpotri(L,lower = 1)[0]
    inv = np.tril(inv)+np.tril(inv,-1).T

    return inv, hld
    return Ai, L, Li, logdet


def chol_inv(L):
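A quick usage check of the new four-value pdinv contract (a sketch; it assumes pdinv is importable as GPy.util.linalg.pdinv, matching the relative imports elsewhere in this commit):

import numpy as np
from GPy.util.linalg import pdinv

A = np.random.randn(5, 8)
A = np.dot(A, A.T) + 5.*np.eye(5) # a positive definite matrix
Ai, L, Li, logdet = pdinv(A)
assert np.allclose(np.dot(A, Ai), np.eye(5)) # Ai is the inverse
assert np.allclose(np.dot(L, L.T), A) # L is the lower Cholesky factor
assert np.allclose(np.dot(Li, L), np.eye(5)) # Li is its inverse
assert np.allclose(logdet, np.linalg.slogdet(A)[1]) # logdet is log|A|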
@@ -139,7 +134,7 @@ def PCA(Y, Q):

    Returns
    -------
    X - NxQ np.array of dimensionality reduced data
    :rval X: NxQ np.array of dimensionality-reduced data
    W - QxD mapping from X to Y
    """
    if not np.allclose(Y.mean(axis=0), 0.0):