Clustering using direct model comparison

2026-07-26 17:11:06 +02:00 · 2016-11-24 18:04:33 +00:00 · 2016-11-24 18:04:33 +00:00 · be77e35b23
commit be77e35b23
parent f2f12787a7
5 changed files with 396 additions and 79 deletions
--- a/GPy/kern/init.py
+++ b/GPy/kern/init.py
@ -27,6 +27,9 @@ from .src.eq_ode2 import EQ_ODE2
 from .src.integral import Integral
 from .src.integral_limits import Integral_Limits
 from .src.multidimensional_integral_limits import Multidimensional_Integral_Limits
+from .src.offset import Offset
+from .src.tie import Tie
+
 from .src.eq_ode1 import EQ_ODE1
 from .src.trunclinear import TruncLinear,TruncLinear_inf
 from .src.splitKern import SplitKern,DEtime
--- a/GPy/kern/src/offset.py
+++ b/GPy/kern/src/offset.py
@ -0,0 +1,72 @@
+# Copyright (c) 2012 - 2014 the GPy Austhors (see AUTHORS.txt)
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+# Written by Mike Smith. michaeltsmith.org.uk
+
+from __future__ import division
+import numpy as np
+from .kern import Kern
+from ...core.parameterization import Param
+from paramz.transformations import Logexp
+import math
+
+class Offset(Kern):
+    """
+    A kernel in which subsets of the data can be shifted.
+    
+    :offsets list of offsets of length of number of subsets
+    """
+    def __init__(self, kernel, input_dim, offsets=[], index_dim=-1, active_dims=None, name='offset'):
+        super(Offset, self).__init__(input_dim, active_dims, name)
+        assert isinstance(index_dim, int), "Offset kernel is only defined with one input dimension being the index"
+        self.kern = kernel
+        
+        self.offsets = Param('offset', offsets)
+        self.link_parameter(self.offsets)
+        
+        self.offset_index_dim = index_dim
+        self.link_parameters(kernel) #maybe whole class should inherit from CombinationKernel?
+
+    def shift(self,X):
+        self.selected = np.array([int(x) for x in X[:,self.offset_index_dim]])
+        offsets = np.hstack([0.0,self.offsets.values])[:,None]
+        return X - offsets[self.selected]
+        
+    def K(self,X ,X2=None):
+        return self.kern.K(self.shift(X),X2)
+
+    def Kdiag(self,X):
+        return self.kern.Kdiag(self.shift(X))
+        
+        
+        
+    def dX_doffset(self,sel,delta):
+        #given the select array (sel) and the offsets (delta)
+        #finds dX/dDelta
+        #returns them as a 2d matrix, one row/col[?] for each offset (delta).
+
+        #a matrix G represents the effect of increasing the offset on the X_i passed to the kernel for each input. For example
+        #what effect will increasing offset 4 have on the kernel output of input 5? Answer: Gs[4,5]... (positive or zero)
+        Gs = []
+        for i,d in enumerate(delta):            
+            G = np.array(sel==(i+1))[:,None]*1
+            Gs.append(G)      
+        return np.array(Gs)*2.0
+            
+    def update_gradients_full(self,dL_dK,X,X2=None):
+        dL_dX = self.kern.gradients_X(dL_dK, self.shift(X), X)
+        dX_doff = self.dX_doffset(self.selected,self.offsets.values)
+        for i in range(len(dX_doff)):
+            dL_doff = dL_dX * dX_doff[i]
+            self.offsets.gradient[i] = -np.sum(dL_doff)
+        self.kern.update_gradients_full(dL_dK,self.shift(X),X2)
+        
+    def gradients_X(self,dL_dK, X, X2=None):
+        return self.kern.gradients_X(dL_dK,self.shift(X),X2)
+
+    def gradients_X_diag(self, dL_dKdiag, X):
+        return self.kern.gradients_X_diag(dL_dKdiag, self.shift(X))
+        
+    def update_gradients_diag(self, dL_dKdiag, X): #TODO What is this?
+        raise NotImplementedError("Not implemented Offset.update_gradients_diag")
+        #self.kern.update_gradients_diag(dL_dKdiag, self.shift(X))
+
--- a/GPy/kern/src/tie.py
+++ b/GPy/kern/src/tie.py
@ -0,0 +1,109 @@
+# Copyright (c) 2012 - 2014 the GPy Austhors (see AUTHORS.txt)
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+# Written by Mike Smith. michaeltsmith.org.uk
+
+from __future__ import division
+import numpy as np
+from .kern import Kern
+from ...core.parameterization import Param
+from paramz.transformations import Logexp
+import math
+
+class Tie(Kern):
+    """
+    A kernel which takes another kernel and ties its parameters together
+    
+    :tied_param_list a list of lists of parameters to tie together. Each item
+                    in the inner list is a regex, so one could use
+                     ['independ.offset.Mat32.lengthscale',
+                      'independ.Mat32_1.lengthscale',
+                      'independ.Mat32.lengthscale']
+                    or just
+                     ['.*lengthscale']
+    """
+    def __init__(self, kernel, input_dim, tied_param_list, active_dims=None, name='tie'):
+           #TODO We need to initialise the values of the parameters to be equal!
+        super(Tie, self).__init__(input_dim, active_dims, name)
+        self.kern = kernel
+
+        self.params = [] #list of parameter objects to tie together
+
+        for tlist in tied_param_list:
+            plist = []  #temp array for list of parameter objects for each tie
+            assert type(tlist) is list, "The tied_param_list should be a list of lists of strings"
+            for t in tlist: #expand regex in each inner list and add all matches
+                plist.extend(self.kern.grep_param_names(t)) 
+                
+            if len(plist)==0:
+                print("Warning: No parameters were added for (%s)" % str(tlist))
+            else:
+                self.params.append(plist)
+            
+        self.link_parameters(self.kern)        
+
+            
+    def get_totals(self,param_list):
+        """
+        Returns the sum total of the gradients and values of the parameters in
+        the param_list
+        """
+        v = None
+        g = None
+        for p in param_list:
+            if v is None:
+                v = p.values.copy()
+                g = p.gradient.copy()
+            else:
+                v += p.values
+                g += p.gradient
+        return v,g
+    
+    def update_gradients_full(self,dL_dK,X,X2=None):
+        self.kern.update_gradients_full(dL_dK,X,X2)
+        for pitem in self.params:
+            l = len(pitem)
+            v,g = self.get_totals(pitem)
+            for p in pitem:
+                #p.param_array[:] = v/l #TODO: Just do once in __init__
+                p.gradient = g/l #pitem['main'].gradient
+            
+    def gradients_X(self,dL_dK, X, X2=None):
+        return self.kern.gradients_X(dL_dK, X, X2=None)
+        
+    def gradients_X_diag(self, dL_dKdiag, X):
+        return self.kern.gradients_X_diag(dL_dKdiag, X)
+
+    def K(self,X ,X2=None):
+        return self.kern.K(X,X2)
+
+    def Kdiag(self,X):
+        return self.kern.Kdiag(X)
+
+    def get_ties_names(self,html=False):
+        textlist = []
+        if html:
+            lb = "<br />"
+        else:
+            lb = "\n"
+        for ps in self.params:
+            innerlist = []
+            for p in ps:
+                innerlist.append(p.hierarchy_name())
+            textlist.append(innerlist)
+        st = lb+lb
+        st += "The following sets of parameters are tied:"
+        for texts in textlist:
+            st += lb
+            st += lb.join(texts)
+            st += lb
+        return st
+                
+    def __str__(self, header=True, VT100=True):
+        st = super(Tie, self).__str__(header, VT100)
+        st += self.get_ties_names()
+        return st
+        
+    def _repr_html_(self, header=True):
+        toprint = super(Tie,self)._repr_html_(header)
+        toprint+= self.get_ties_names(html=True)
+        return toprint
--- a/GPy/util/cluster_with_offset.py
+++ b/GPy/util/cluster_with_offset.py
@ -4,60 +4,72 @@
 import GPy
 import numpy as np

-def add_index_column(inputs,data,clust):
+def get_log_likelihood_diff(inputs,data,clusti,clustj,common_kern,noise_scale):
+    
+    #offset indicies
+    ind_offset = np.vstack([np.zeros([N,1]),np.ones([N,1]),nans])

-    S = data[0].shape[0] #number of time series
-        
-    X = np.zeros([0,2]) #notice the extra column, this is for the cluster index
-    Y = np.zeros([0,S])
-    
-    #for each cluster, add their inputs and data to the new
-    #dataset. Note we add an index identifying which person is which data point.
-    #This is for the offset model to use, to allow it to know which data points
-    #to shift.
-    for i,p in enumerate(clust):
-        idx = i*np.ones([inputs[p].shape[0],1])
-        X = np.vstack([X,np.hstack([inputs[p],idx])])
-        Y = np.vstack([Y,data[p].T])
-    return X,Y
-    
-def get_individual_log_likelihood_offset(inputs,data,clust,common_kern):
-    """Get the LL of a pair of clusters, but having them independent
-    
-    arguments:
-    inputs -- the 'X's in a list, one item per cluster
-    data -- the 'Y's in a list, one item per cluster
-    clust -- list of clusters to use
-    
-    returns log likelihood
-    """
-    X,Y = add_index_column(inputs,data,clust)
-    k_independent = GPy.kern.IndependentOutputs(common_kern.copy(),index_dim=1)
-    m = GPy.models.GPRegression(X,Y,k_independent)
-    m.optimize()
-    ll=m.log_likelihood()    
-    return ll
+    #independent output indicies
+    ind_indpoutputs = np.vstack([np.zeros([N*2,1]),np.ones([N,1]),np.ones([N,1])*2])
+    X = np.hstack([X,ind_offset,ind_indpoutputs])
+    Y1 = np.sin((X[0:N,0])/10.0)[:,None]
+    #Y2 = np.sin((X[0:N,0])/10.0)[:,None]
+    Y2 = np.cos((X[0:N,0])/10.0)[:,None]
+    Y1 += np.random.randn(Y1.shape[0],Y1.shape[1])*0.1
+    Y2 += np.random.randn(Y2.shape[0],Y2.shape[1])*0.1
+    Y = np.vstack([Y1,Y2,Y1,Y2])

-def get_shared_log_likelihood_offset(inputs,data,clust,common_kern):
-    """Get the log likelihood of a combined set of clusters, fitting the offsets
-    
-    arguments:
-    inputs -- the 'X's in a list, one item per cluster
-    data -- the 'Y's in a list, one item per cluster
-    clust -- list of clusters to use
-    
-    returns a tuple:
-    log likelihood and the offset
-    """    
-    X,Y = add_index_column(inputs,data,clust)
-    m = GPy.models.GPOffsetRegression(X,Y,common_kern.copy())
-    # m.offset.set_prior(GPy.priors.Gaussian(0,20)) 
-    m.optimize()
-    ll = m.log_likelihood()
-    offset = m.offset.values[0]
-    return ll,offset
+    #Structure of inputs:
+    # actual input | offset_kernel_index | indp_output_index
+    #      2.4              0                     0
+    #      2.9              0                     0
+    #      3.4              1                     0
+    #      3.9              1                     0
+    #      2.4              nan                   1
+    #      2.9              nan                   1
+    #      3.4              nan                   2
+    #      3.9              nan                   2
+    #print X
+    #print Y

-def cluster(data,inputs,common_kern,verbose=False):
+    #base kernel to explain all time series with
+    common_kern = GPy.kern.Matern32(input_dim=1)
+
+    #the offset kernel, that can shift one time series wrt another
+    offset_kern = GPy.kern.Offset(common_kern,2,[0])
+
+    #we want to discourage massive offsets, which can achieve good fits by simply moving the two datasets far apart
+    offset_kern.offset.set_prior(GPy.priors.Gaussian(0,4.0))
+
+    #our overall kernel contains our offset kernel and two common kernels
+    independent_kern = GPy.kern.IndependentOutputs([offset_kern,common_kern.copy(),common_kern.copy()],index_dim=2)
+
+    tiekern = GPy.kern.Tie(independent_kern,3,[['.*lengthscale'],['.*variance']])
+
+    model = GPy.models.GPRegression(X,Y,tiekern)
+    model.optimize()
+
+
+
+
+    #base kernel to explain all time series with
+    common_kern = GPy.kern.Matern32(input_dim=1)
+
+    #the offset kernel, that can shift one time series wrt another
+    offset_kern = GPy.kern.Offset(common_kern,2,[0])
+
+    #we want to discourage massive offsets, which can achieve good fits by simply moving the two datasets far apart
+    offset_kern.offset.set_prior(GPy.priors.Gaussian(0,4.0))
+
+    #our overall kernel contains our offset kernel and two common kernels
+    independent_kern = GPy.kern.IndependentOutputs([common_kern.copy(),common_kern.copy()],index_dim=1)
+    independent_model = GPy.models.GPRegression(indepX,indepY,independent_kern)
+
+    offset_model = GPy.models.GPRegression(offsetX,offsetY,offset_kern)
+
+
+
+def cluster(data,inputs,common_kern,noise_scale=1.0,verbose=False):
    """Clusters data
    
    Using the new offset model, this method uses a greedy algorithm to cluster
@ -74,55 +86,44 @@ def cluster(data,inputs,common_kern,verbose=False):
    """
    N=len(data)
    
-    
    #Define a set of N active cluster
    active = []
    for p in range(0,N):
        active.append([p])

-    individualloglikes = np.zeros([len(active),len(active)])
-    individualloglikes[:] = None
-    sharedloglikes = np.zeros([len(active),len(active)])
-    sharedloglikes[:] = None
-    sharedoffset = np.zeros([len(active),len(active)])
+    diffloglikes = np.zeros([len(active),len(active)])
+    diffloglikes[:] = None
+    offsets = np.zeros([len(active),len(active)])

    it = 0
    while True:
-    
        if verbose:
            it +=1
            print("Iteration %d" % it)
-        
-        #Compute the log-likelihood of each cluster
-        for clusti in range(len(active)):
-            #try combining with each other cluster...
+                
+        for clusti in range(len(active)):        
            for clustj in range(clusti): #count from 0 to clustj-1
-                temp = [clusti,clustj]
-                if np.isnan(sharedloglikes[clusti,clustj]):
-                    sharedloglikes[clusti,clustj],sharedoffset[clusti,clustj] = get_shared_log_likelihood_offset(inputs,data,temp,common_kern)    
-                    individualloglikes[clusti,clustj] = get_individual_log_likelihood_offset(inputs,data,temp,common_kern)
+                if np.isnan(diffloglikes[clusti,clustj]):
+                    diffloglikes[clusti,clustj],offsets[clusti,clustj] = get_log_likelihood_diff(inputs,data,[clusti,clustj],common_kern,noise_scale)

-        loglikeimprovement = sharedloglikes - individualloglikes #how much likelihood improves with clustering
-        top = np.unravel_index(np.nanargmax(sharedloglikes-individualloglikes), sharedloglikes.shape)
+        #np.fill_diagonal(diffloglikes,np.nan)
        
-        if loglikeimprovement[top[0],top[1]]>0:
+        top = np.unravel_index(np.nanargmax(diffloglikes), diffloglikes.shape)
+
+        if diffloglikes[top[0],top[1]]>0:
            active[top[0]].extend(active[top[1]])
-            offset=sharedoffset[top[0],top[1]]
+            offset=offsets[top[0],top[1]]
            inputs[top[0]] = np.vstack([inputs[top[0]],inputs[top[1]]-offset])
            data[top[0]] = np.hstack([data[top[0]],data[top[1]]])
            del inputs[top[1]]
            del data[top[1]]
            del active[top[1]]
-
+            
            #None = we need to recalculate
-            sharedloglikes[:,top[0]] = None 
-            sharedloglikes[top[0],:] = None 
-            sharedloglikes = np.delete(sharedloglikes,top[1],0)
-            sharedloglikes = np.delete(sharedloglikes,top[1],1)
-            individualloglikes[:,top[0]] = None 
-            individualloglikes[top[0],:] = None 
-            individualloglikes = np.delete(individualloglikes,top[1],0)
-            individualloglikes = np.delete(individualloglikes,top[1],1)
+            diffloglikes[:,top[0]] = None
+            diffloglikes[top[0],:] = None
+            diffloglikes = np.delete(diffloglikes,top[1],0)
+            diffloglikes = np.delete(diffloglikes,top[1],1)
        else:
            break
            
--- a/GPy/util/cluster_with_offset_using_loo.py
+++ b/GPy/util/cluster_with_offset_using_loo.py
@ -0,0 +1,132 @@
+# Copyright (c) 2016, Mike Smith
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import GPy
+import numpy as np
+
+def add_index_column(inputs,data,clust):
+
+    S = data[0].shape[0] #number of time series
+        
+    X = np.zeros([0,2]) #notice the extra column, this is for the cluster index
+    Y = np.zeros([0,S])
+    
+    #for each cluster, add their inputs and data to the new
+    #dataset. Note we add an index identifying which person is which data point.
+    #This is for the offset model to use, to allow it to know which data points
+    #to shift.
+    for i,p in enumerate(clust):
+        idx = i*np.ones([inputs[p].shape[0],1])
+        X = np.vstack([X,np.hstack([inputs[p],idx])])
+        Y = np.vstack([Y,data[p].T])
+    return X,Y
+    
+def get_individual_log_likelihood_offset(inputs,data,clust,common_kern,noise_scale):
+    m = GPy.models.GPRegression(inputs[clust],data[clust].T,common_kern.copy())
+    m.Gaussian_noise.fix(noise_scale)
+    m.optimize()
+    #m.optimize_restarts(2,verbose=False)
+    ll = np.sum(m.LOO())
+    return ll
+
+def get_shared_log_likelihood_offset(inputs,data,clust,common_kern,noise_scale):
+    """Get the log likelihood of a combined set of clusters, fitting the offsets
+    
+    arguments:
+    inputs -- the 'X's in a list, one item per cluster
+    data -- the 'Y's in a list, one item per cluster
+    clust -- list of clusters to use
+    
+    returns a tuple:
+    log likelihood and the offset
+    """    
+    X,Y = add_index_column(inputs,data,clust)
+    m = GPy.models.GPOffsetRegression(X,Y,common_kern.copy())
+    m.offset.set_prior(GPy.priors.Gaussian(0,20)) 
+    m.Gaussian_noise.fix(noise_scale)
+    try:
+        m.optimize()
+        #m.optimize_restarts(2,verbose=False)
+    except np.linalg.LinAlgError:
+        print("Optimization failed: assuming poor fit")
+        return -10000,0 #not being able to fit this is probably a sign this isn't a good fit?? TODO
+    ll = np.sum(m.LOO())
+    offset = m.offset.values[0]
+    return ll,offset
+
+def cluster(data,inputs,common_kern,noise_scale=1.0,verbose=False):
+    """Clusters data
+    
+    Using the new offset model, this method uses a greedy algorithm to cluster
+    the data. It starts with all the data points in separate clusters and tests
+    whether combining them increases the overall log-likelihood (LL). It then
+    iteratively joins pairs of clusters which cause the greatest increase in
+    the LL, until no join increases the LL.
+    
+    arguments:
+    inputs -- the 'X's in a list, one item per cluster
+    data -- the 'Y's in a list, one item per cluster
+    
+    returns a list of the clusters.    
+    """
+    N=len(data)
+    
+    
+    #Define a set of N active cluster
+    active = []
+    for p in range(0,N):
+        active.append([p])
+
+    individualloglikes = np.zeros([len(active)])
+    individualloglikes[:] = None
+    sharedloglikes = np.zeros([len(active),len(active)])
+    sharedloglikes[:] = None
+    sharedoffset = np.zeros([len(active),len(active)])
+
+    it = 0
+    while True:
+    
+        if verbose:
+            it +=1
+            print("Iteration %d" % it)
+        
+        #Compute the log-likelihood of each cluster
+        for clusti in range(len(active)):
+            individualloglikes[clusti] = get_individual_log_likelihood_offset(inputs,data,clusti,common_kern,noise_scale)
+
+            #try combining with each other cluster...        
+        for clusti in range(len(active)):        
+            for clustj in range(clusti): #count from 0 to clustj-1
+                temp = [clusti,clustj]
+                if np.isnan(sharedloglikes[clusti,clustj]):
+                    sharedloglikes[clusti,clustj],sharedoffset[clusti,clustj] = get_shared_log_likelihood_offset(inputs,data,temp,common_kern,noise_scale)
+
+        #how much likelihood improves with clustering
+        loglikeimprovement = sharedloglikes - (individualloglikes[:,None] + individualloglikes[None,:]) 
+        np.fill_diagonal(loglikeimprovement,np.nan)
+        
+        top = np.unravel_index(np.nanargmax(sharedloglikes-individualloglikes), sharedloglikes.shape)
+        #print loglikeimprovement
+        if loglikeimprovement[top[0],top[1]]>-10:
+            active[top[0]].extend(active[top[1]])
+            offset=sharedoffset[top[0],top[1]]
+            inputs[top[0]] = np.vstack([inputs[top[0]],inputs[top[1]]-offset])
+            data[top[0]] = np.hstack([data[top[0]],data[top[1]]])
+            del inputs[top[1]]
+            del data[top[1]]
+            del active[top[1]]
+
+            #None = we need to recalculate
+            sharedloglikes[:,top[0]] = None 
+            sharedloglikes[top[0],:] = None 
+            sharedloglikes = np.delete(sharedloglikes,top[1],0)
+            sharedloglikes = np.delete(sharedloglikes,top[1],1)
+            individualloglikes[top[0]] = None 
+            individualloglikes[top[1]] = None 
+            individualloglikes = np.delete(individualloglikes,top[1])
+            #print len(individualloglikes)
+        else:
+            break
+            
+    #TODO Add a way to return the offsets applied to all the time series
+    return active