Tidied up generation of X and Y prior to clustering

This commit is contained in:
Michael T Smith 2016-09-06 10:52:25 +01:00
parent 84d2ffcedb
commit 813fbd4162

View file

@ -5,54 +5,7 @@ import GPy
import numpy as np import numpy as np
import sys #so I can print dots import sys #so I can print dots
def get_individual_log_likelihood_offset(inputs,data,clust,common_kern): def add_index_column(inputs,data,clust):
"""Get the LL of a pair of clusters, but having them independent
Get the log likelihood of a cluster pair.
arguments:
inputs -- the 'X's in a list, one item per cluster
data -- the 'Y's in a list, one item per cluster
clust -- list of clusters to use
returns a tuple:
log likelihood and the offset
"""
S = data[0].shape[0] #number of time series
#build a new dataset from the clusters, by combining all clusters together
X = np.zeros([0,1])
Y = np.zeros([0,S])
#for each person in the cluster,
#add their inputs and data to the new dataset
idx = np.zeros([0,1])
for it,p in enumerate(clust):
X = np.vstack([X,inputs[p]])
Y = np.vstack([Y,data[p].T])
idx = np.r_[idx,np.ones([data[p].shape[1],1])*it]
X = np.c_[X,idx]
#print(X)
k_independent = GPy.kern.IndependentOutputs(common_kern.copy(),index_dim=1)
m = GPy.models.GPRegression(X,Y,k_independent)
m.optimize()
ll=m.log_likelihood()
return ll,0
def get_shared_log_likelihood_offset(inputs,data,clust,common_kern):
"""Get the log likelihood of a combined set of clusters, fitting the offsets
arguments:
inputs -- the 'X's in a list, one item per cluster
data -- the 'Y's in a list, one item per cluster
clust -- list of clusters to use
returns a tuple:
log likelihood and the offset
"""
S = data[0].shape[0] #number of time series S = data[0].shape[0] #number of time series
@ -67,16 +20,40 @@ def get_shared_log_likelihood_offset(inputs,data,clust,common_kern):
idx = i*np.ones([inputs[p].shape[0],1]) idx = i*np.ones([inputs[p].shape[0],1])
X = np.vstack([X,np.hstack([inputs[p],idx])]) X = np.vstack([X,np.hstack([inputs[p],idx])])
Y = np.vstack([Y,data[p].T]) Y = np.vstack([Y,data[p].T])
return X,Y
m = GPy.models.GPOffsetRegression(X,Y,common_kern.copy()) def get_individual_log_likelihood_offset(inputs,data,clust,common_kern):
"""Get the LL of a pair of clusters, but having them independent
#TODO: How to select a sensible prior? arguments:
## m.offset.set_prior(GPy.priors.Gaussian(0,20)) inputs -- the 'X's in a list, one item per cluster
#TODO: Set a sensible start value for the length scale, data -- the 'Y's in a list, one item per cluster
#make it long to help the offset fit. clust -- list of clusters to use
returns log likelihood
"""
X,Y = add_index_column(inputs,data,clust)
k_independent = GPy.kern.IndependentOutputs(common_kern.copy(),index_dim=1)
m = GPy.models.GPRegression(X,Y,k_independent)
m.optimize() m.optimize()
ll=m.log_likelihood()
return ll
def get_shared_log_likelihood_offset(inputs,data,clust,common_kern):
"""Get the log likelihood of a combined set of clusters, fitting the offsets
arguments:
inputs -- the 'X's in a list, one item per cluster
data -- the 'Y's in a list, one item per cluster
clust -- list of clusters to use
returns a tuple:
log likelihood and the offset
"""
X,Y = add_index_column(inputs,data,clust)
m = GPy.models.GPOffsetRegression(X,Y,common_kern.copy())
# m.offset.set_prior(GPy.priors.Gaussian(0,20))
m.optimize()
ll = m.log_likelihood() ll = m.log_likelihood()
offset = m.offset.values[0] offset = m.offset.values[0]
return ll,offset return ll,offset
@ -124,24 +101,12 @@ def cluster(data,inputs,common_kern,verbose=False):
temp = [clusti,clustj] temp = [clusti,clustj]
if np.isnan(sharedloglikes[clusti,clustj]): if np.isnan(sharedloglikes[clusti,clustj]):
sharedloglikes[clusti,clustj],sharedoffset[clusti,clustj] = get_shared_log_likelihood_offset(inputs,data,temp,common_kern) sharedloglikes[clusti,clustj],sharedoffset[clusti,clustj] = get_shared_log_likelihood_offset(inputs,data,temp,common_kern)
individualloglikes[clusti,clustj], unused_offset = get_individual_log_likelihood_offset(inputs,data,temp,common_kern) individualloglikes[clusti,clustj] = get_individual_log_likelihood_offset(inputs,data,temp,common_kern)
loglikeimprovement = sharedloglikes - individualloglikes #how much likelihood improves with clustering loglikeimprovement = sharedloglikes - individualloglikes #how much likelihood improves with clustering
top = np.unravel_index(np.nanargmax(sharedloglikes-individualloglikes), sharedloglikes.shape) top = np.unravel_index(np.nanargmax(sharedloglikes-individualloglikes), sharedloglikes.shape)
#if loglikeimprovement.shape[0]<3:
# #no more clustering to do - this shouldn't happen really unless
# #we've set the threshold to apply clustering to less than 0
# break
#if theres further clustering to be done...
# print("SHARED")
# print(sharedloglikes)
# print("IND")
# print(individualloglikes)
# print("----")
if loglikeimprovement[top[0],top[1]]>0: if loglikeimprovement[top[0],top[1]]>0:
#print(loglikeimprovement[top[0],top[1]])
active[top[0]].extend(active[top[1]]) active[top[0]].extend(active[top[1]])
offset=sharedoffset[top[0],top[1]] offset=sharedoffset[top[0],top[1]]
inputs[top[0]] = np.vstack([inputs[top[0]],inputs[top[1]]-offset]) inputs[top[0]] = np.vstack([inputs[top[0]],inputs[top[1]]-offset])
@ -150,7 +115,7 @@ def cluster(data,inputs,common_kern,verbose=False):
del data[top[1]] del data[top[1]]
del active[top[1]] del active[top[1]]
#None = message to say we need to recalculate #None = we need to recalculate
sharedloglikes[:,top[0]] = None sharedloglikes[:,top[0]] = None
sharedloglikes[top[0],:] = None sharedloglikes[top[0],:] = None
sharedloglikes = np.delete(sharedloglikes,top[1],0) sharedloglikes = np.delete(sharedloglikes,top[1],0)
@ -162,17 +127,5 @@ def cluster(data,inputs,common_kern,verbose=False):
else: else:
break break
#if loglikeimprovement[top[0],top[1]]>0:
# print "joined"
# print top
# print offset
# print offsets
# print offsets[top[1]]-offsets[top[0]]
#TODO Add a way to return the offsets applied to all the time series #TODO Add a way to return the offsets applied to all the time series
return active return active
#starttime = time.time()
#active = cluster(data,inputs)
#endtime = time.time()
#print "TOTAL TIME %0.4f" % (endtime-starttime)