mirror of
https://github.com/SheffieldML/GPy.git
synced 2026-06-08 15:05:15 +02:00
Tidied up generation of X and Y prior to clustering
This commit is contained in:
parent
84d2ffcedb
commit
813fbd4162
1 changed files with 35 additions and 82 deletions
|
|
@ -5,54 +5,7 @@ import GPy
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import sys #so I can print dots
|
import sys #so I can print dots
|
||||||
|
|
||||||
def get_individual_log_likelihood_offset(inputs,data,clust,common_kern):
|
def add_index_column(inputs,data,clust):
|
||||||
"""Get the LL of a pair of clusters, but having them independent
|
|
||||||
|
|
||||||
Get the log likelihood of a cluster pair.
|
|
||||||
|
|
||||||
arguments:
|
|
||||||
inputs -- the 'X's in a list, one item per cluster
|
|
||||||
data -- the 'Y's in a list, one item per cluster
|
|
||||||
clust -- list of clusters to use
|
|
||||||
|
|
||||||
returns a tuple:
|
|
||||||
log likelihood and the offset
|
|
||||||
"""
|
|
||||||
|
|
||||||
S = data[0].shape[0] #number of time series
|
|
||||||
|
|
||||||
#build a new dataset from the clusters, by combining all clusters together
|
|
||||||
X = np.zeros([0,1])
|
|
||||||
Y = np.zeros([0,S])
|
|
||||||
|
|
||||||
#for each person in the cluster,
|
|
||||||
#add their inputs and data to the new dataset
|
|
||||||
idx = np.zeros([0,1])
|
|
||||||
for it,p in enumerate(clust):
|
|
||||||
X = np.vstack([X,inputs[p]])
|
|
||||||
Y = np.vstack([Y,data[p].T])
|
|
||||||
idx = np.r_[idx,np.ones([data[p].shape[1],1])*it]
|
|
||||||
|
|
||||||
X = np.c_[X,idx]
|
|
||||||
#print(X)
|
|
||||||
k_independent = GPy.kern.IndependentOutputs(common_kern.copy(),index_dim=1)
|
|
||||||
|
|
||||||
m = GPy.models.GPRegression(X,Y,k_independent)
|
|
||||||
m.optimize()
|
|
||||||
ll=m.log_likelihood()
|
|
||||||
return ll,0
|
|
||||||
|
|
||||||
def get_shared_log_likelihood_offset(inputs,data,clust,common_kern):
|
|
||||||
"""Get the log likelihood of a combined set of clusters, fitting the offsets
|
|
||||||
|
|
||||||
arguments:
|
|
||||||
inputs -- the 'X's in a list, one item per cluster
|
|
||||||
data -- the 'Y's in a list, one item per cluster
|
|
||||||
clust -- list of clusters to use
|
|
||||||
|
|
||||||
returns a tuple:
|
|
||||||
log likelihood and the offset
|
|
||||||
"""
|
|
||||||
|
|
||||||
S = data[0].shape[0] #number of time series
|
S = data[0].shape[0] #number of time series
|
||||||
|
|
||||||
|
|
@ -67,16 +20,40 @@ def get_shared_log_likelihood_offset(inputs,data,clust,common_kern):
|
||||||
idx = i*np.ones([inputs[p].shape[0],1])
|
idx = i*np.ones([inputs[p].shape[0],1])
|
||||||
X = np.vstack([X,np.hstack([inputs[p],idx])])
|
X = np.vstack([X,np.hstack([inputs[p],idx])])
|
||||||
Y = np.vstack([Y,data[p].T])
|
Y = np.vstack([Y,data[p].T])
|
||||||
|
return X,Y
|
||||||
|
|
||||||
m = GPy.models.GPOffsetRegression(X,Y,common_kern.copy())
|
def get_individual_log_likelihood_offset(inputs,data,clust,common_kern):
|
||||||
|
"""Get the LL of a pair of clusters, but having them independent
|
||||||
|
|
||||||
#TODO: How to select a sensible prior?
|
arguments:
|
||||||
## m.offset.set_prior(GPy.priors.Gaussian(0,20))
|
inputs -- the 'X's in a list, one item per cluster
|
||||||
#TODO: Set a sensible start value for the length scale,
|
data -- the 'Y's in a list, one item per cluster
|
||||||
#make it long to help the offset fit.
|
clust -- list of clusters to use
|
||||||
|
|
||||||
|
returns log likelihood
|
||||||
|
"""
|
||||||
|
X,Y = add_index_column(inputs,data,clust)
|
||||||
|
k_independent = GPy.kern.IndependentOutputs(common_kern.copy(),index_dim=1)
|
||||||
|
m = GPy.models.GPRegression(X,Y,k_independent)
|
||||||
m.optimize()
|
m.optimize()
|
||||||
|
ll=m.log_likelihood()
|
||||||
|
return ll
|
||||||
|
|
||||||
|
def get_shared_log_likelihood_offset(inputs,data,clust,common_kern):
|
||||||
|
"""Get the log likelihood of a combined set of clusters, fitting the offsets
|
||||||
|
|
||||||
|
arguments:
|
||||||
|
inputs -- the 'X's in a list, one item per cluster
|
||||||
|
data -- the 'Y's in a list, one item per cluster
|
||||||
|
clust -- list of clusters to use
|
||||||
|
|
||||||
|
returns a tuple:
|
||||||
|
log likelihood and the offset
|
||||||
|
"""
|
||||||
|
X,Y = add_index_column(inputs,data,clust)
|
||||||
|
m = GPy.models.GPOffsetRegression(X,Y,common_kern.copy())
|
||||||
|
# m.offset.set_prior(GPy.priors.Gaussian(0,20))
|
||||||
|
m.optimize()
|
||||||
ll = m.log_likelihood()
|
ll = m.log_likelihood()
|
||||||
offset = m.offset.values[0]
|
offset = m.offset.values[0]
|
||||||
return ll,offset
|
return ll,offset
|
||||||
|
|
@ -124,24 +101,12 @@ def cluster(data,inputs,common_kern,verbose=False):
|
||||||
temp = [clusti,clustj]
|
temp = [clusti,clustj]
|
||||||
if np.isnan(sharedloglikes[clusti,clustj]):
|
if np.isnan(sharedloglikes[clusti,clustj]):
|
||||||
sharedloglikes[clusti,clustj],sharedoffset[clusti,clustj] = get_shared_log_likelihood_offset(inputs,data,temp,common_kern)
|
sharedloglikes[clusti,clustj],sharedoffset[clusti,clustj] = get_shared_log_likelihood_offset(inputs,data,temp,common_kern)
|
||||||
individualloglikes[clusti,clustj], unused_offset = get_individual_log_likelihood_offset(inputs,data,temp,common_kern)
|
individualloglikes[clusti,clustj] = get_individual_log_likelihood_offset(inputs,data,temp,common_kern)
|
||||||
|
|
||||||
loglikeimprovement = sharedloglikes - individualloglikes #how much likelihood improves with clustering
|
loglikeimprovement = sharedloglikes - individualloglikes #how much likelihood improves with clustering
|
||||||
top = np.unravel_index(np.nanargmax(sharedloglikes-individualloglikes), sharedloglikes.shape)
|
top = np.unravel_index(np.nanargmax(sharedloglikes-individualloglikes), sharedloglikes.shape)
|
||||||
|
|
||||||
#if loglikeimprovement.shape[0]<3:
|
|
||||||
# #no more clustering to do - this shouldn't happen really unless
|
|
||||||
# #we've set the threshold to apply clustering to less than 0
|
|
||||||
# break
|
|
||||||
|
|
||||||
#if theres further clustering to be done...
|
|
||||||
# print("SHARED")
|
|
||||||
# print(sharedloglikes)
|
|
||||||
# print("IND")
|
|
||||||
# print(individualloglikes)
|
|
||||||
# print("----")
|
|
||||||
if loglikeimprovement[top[0],top[1]]>0:
|
if loglikeimprovement[top[0],top[1]]>0:
|
||||||
#print(loglikeimprovement[top[0],top[1]])
|
|
||||||
active[top[0]].extend(active[top[1]])
|
active[top[0]].extend(active[top[1]])
|
||||||
offset=sharedoffset[top[0],top[1]]
|
offset=sharedoffset[top[0],top[1]]
|
||||||
inputs[top[0]] = np.vstack([inputs[top[0]],inputs[top[1]]-offset])
|
inputs[top[0]] = np.vstack([inputs[top[0]],inputs[top[1]]-offset])
|
||||||
|
|
@ -150,7 +115,7 @@ def cluster(data,inputs,common_kern,verbose=False):
|
||||||
del data[top[1]]
|
del data[top[1]]
|
||||||
del active[top[1]]
|
del active[top[1]]
|
||||||
|
|
||||||
#None = message to say we need to recalculate
|
#None = we need to recalculate
|
||||||
sharedloglikes[:,top[0]] = None
|
sharedloglikes[:,top[0]] = None
|
||||||
sharedloglikes[top[0],:] = None
|
sharedloglikes[top[0],:] = None
|
||||||
sharedloglikes = np.delete(sharedloglikes,top[1],0)
|
sharedloglikes = np.delete(sharedloglikes,top[1],0)
|
||||||
|
|
@ -162,17 +127,5 @@ def cluster(data,inputs,common_kern,verbose=False):
|
||||||
else:
|
else:
|
||||||
break
|
break
|
||||||
|
|
||||||
#if loglikeimprovement[top[0],top[1]]>0:
|
|
||||||
# print "joined"
|
|
||||||
# print top
|
|
||||||
# print offset
|
|
||||||
# print offsets
|
|
||||||
# print offsets[top[1]]-offsets[top[0]]
|
|
||||||
|
|
||||||
#TODO Add a way to return the offsets applied to all the time series
|
#TODO Add a way to return the offsets applied to all the time series
|
||||||
return active
|
return active
|
||||||
|
|
||||||
#starttime = time.time()
|
|
||||||
#active = cluster(data,inputs)
|
|
||||||
#endtime = time.time()
|
|
||||||
#print "TOTAL TIME %0.4f" % (endtime-starttime)
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue