Tidied up generation of X and Y prior to clustering

2026-07-26 17:11:06 +02:00 · 2016-09-06 10:52:25 +01:00 · 2016-09-06 10:52:25 +01:00 · 813fbd4162
commit 813fbd4162
parent 84d2ffcedb
1 changed files with 35 additions and 82 deletions
--- a/GPy/util/cluster_with_offset.py
+++ b/GPy/util/cluster_with_offset.py
@ -5,54 +5,7 @@ import GPy
 import numpy as np
 import sys #so I can print dots

-def get_individual_log_likelihood_offset(inputs,data,clust,common_kern):
-    """Get the LL of a pair of clusters, but having them independent
-    
-    Get the log likelihood of a cluster pair.
-    
-    arguments:
-    inputs -- the 'X's in a list, one item per cluster
-    data -- the 'Y's in a list, one item per cluster
-    clust -- list of clusters to use
-    
-    returns a tuple:
-    log likelihood and the offset
-    """
- 
-    S = data[0].shape[0] #number of time series
-    
-    #build a new dataset from the clusters, by combining all clusters together
-    X = np.zeros([0,1])
-    Y = np.zeros([0,S])
-    
-    #for each person in the cluster,
-    #add their inputs and data to the new dataset
-    idx = np.zeros([0,1])
-    for it,p in enumerate(clust):
-        X = np.vstack([X,inputs[p]])
-        Y = np.vstack([Y,data[p].T])
-        idx = np.r_[idx,np.ones([data[p].shape[1],1])*it]
-
-    X = np.c_[X,idx]
-    #print(X)
-    k_independent = GPy.kern.IndependentOutputs(common_kern.copy(),index_dim=1)
-
-    m = GPy.models.GPRegression(X,Y,k_independent)
-    m.optimize()
-    ll=m.log_likelihood()    
-    return ll,0
-
-def get_shared_log_likelihood_offset(inputs,data,clust,common_kern):
-    """Get the log likelihood of a combined set of clusters, fitting the offsets
-    
-    arguments:
-    inputs -- the 'X's in a list, one item per cluster
-    data -- the 'Y's in a list, one item per cluster
-    clust -- list of clusters to use
-    
-    returns a tuple:
-    log likelihood and the offset
-    """    
+def add_index_column(inputs,data,clust):

    S = data[0].shape[0] #number of time series
        
@ -67,16 +20,40 @@ def get_shared_log_likelihood_offset(inputs,data,clust,common_kern):
        idx = i*np.ones([inputs[p].shape[0],1])
        X = np.vstack([X,np.hstack([inputs[p],idx])])
        Y = np.vstack([Y,data[p].T])
-                      
-    m = GPy.models.GPOffsetRegression(X,Y,common_kern.copy())
-
-    #TODO: How to select a sensible prior?
-   ## m.offset.set_prior(GPy.priors.Gaussian(0,20)) 
-    #TODO: Set a sensible start value for the length scale,
-    #make it long to help the offset fit.
+    return X,Y
    
+def get_individual_log_likelihood_offset(inputs,data,clust,common_kern):
+    """Get the LL of a pair of clusters, but having them independent
+    
+    arguments:
+    inputs -- the 'X's in a list, one item per cluster
+    data -- the 'Y's in a list, one item per cluster
+    clust -- list of clusters to use
+    
+    returns log likelihood
+    """
+    X,Y = add_index_column(inputs,data,clust)
+    k_independent = GPy.kern.IndependentOutputs(common_kern.copy(),index_dim=1)
+    m = GPy.models.GPRegression(X,Y,k_independent)
    m.optimize()
+    ll=m.log_likelihood()    
+    return ll
+
+def get_shared_log_likelihood_offset(inputs,data,clust,common_kern):
+    """Get the log likelihood of a combined set of clusters, fitting the offsets
    
+    arguments:
+    inputs -- the 'X's in a list, one item per cluster
+    data -- the 'Y's in a list, one item per cluster
+    clust -- list of clusters to use
+    
+    returns a tuple:
+    log likelihood and the offset
+    """    
+    X,Y = add_index_column(inputs,data,clust)
+    m = GPy.models.GPOffsetRegression(X,Y,common_kern.copy())
+    # m.offset.set_prior(GPy.priors.Gaussian(0,20)) 
+    m.optimize()
    ll = m.log_likelihood()
    offset = m.offset.values[0]
    return ll,offset
@ -124,24 +101,12 @@ def cluster(data,inputs,common_kern,verbose=False):
                temp = [clusti,clustj]
                if np.isnan(sharedloglikes[clusti,clustj]):
                    sharedloglikes[clusti,clustj],sharedoffset[clusti,clustj] = get_shared_log_likelihood_offset(inputs,data,temp,common_kern)    
-                    individualloglikes[clusti,clustj], unused_offset = get_individual_log_likelihood_offset(inputs,data,temp,common_kern)
+                    individualloglikes[clusti,clustj] = get_individual_log_likelihood_offset(inputs,data,temp,common_kern)

        loglikeimprovement = sharedloglikes - individualloglikes #how much likelihood improves with clustering
        top = np.unravel_index(np.nanargmax(sharedloglikes-individualloglikes), sharedloglikes.shape)
-
-        #if loglikeimprovement.shape[0]<3:
-        # #no more clustering to do - this shouldn't happen really unless
-        # #we've set the threshold to apply clustering to less than 0
-        #    break 
        
-        #if theres further clustering to be done...
-       # print("SHARED")
-       # print(sharedloglikes)
-       # print("IND")
-       # print(individualloglikes)        
-       # print("----")
        if loglikeimprovement[top[0],top[1]]>0:
-            #print(loglikeimprovement[top[0],top[1]])
            active[top[0]].extend(active[top[1]])
            offset=sharedoffset[top[0],top[1]]
            inputs[top[0]] = np.vstack([inputs[top[0]],inputs[top[1]]-offset])
@ -150,7 +115,7 @@ def cluster(data,inputs,common_kern,verbose=False):
            del data[top[1]]
            del active[top[1]]

-            #None = message to say we need to recalculate
+            #None = we need to recalculate
            sharedloglikes[:,top[0]] = None 
            sharedloglikes[top[0],:] = None 
            sharedloglikes = np.delete(sharedloglikes,top[1],0)
@ -162,17 +127,5 @@ def cluster(data,inputs,common_kern,verbose=False):
        else:
            break
            
-        #if loglikeimprovement[top[0],top[1]]>0:
-        #    print "joined"
-        #    print top
-        #    print offset
-        #    print offsets
-        #    print offsets[top[1]]-offsets[top[0]]
-
    #TODO Add a way to return the offsets applied to all the time series
-    return active 
-
-#starttime = time.time()
-#active = cluster(data,inputs)
-#endtime = time.time()
-#print "TOTAL TIME %0.4f" % (endtime-starttime)
+    return active