Fixed clustering maths: Previously was comparing models with unequal numbers of points and different parameters

This commit is contained in:
Michael T Smith 2016-09-06 10:24:46 +01:00
parent 532d7188e7
commit 84d2ffcedb
2 changed files with 47 additions and 53 deletions

View file

@ -86,7 +86,7 @@ class GPOffsetRegression(GP):
self.X = self.X_fixed - offsets[self.selected] self.X = self.X_fixed - offsets[self.selected]
super(GPOffsetRegression, self).parameters_changed() super(GPOffsetRegression, self).parameters_changed()
dL_dr = self.kern.dK_dr_via_X(self.X, self.X) * self.grad_dict['dL_dK'] dL_dr = self.kern.dK_dr_via_X(self.X, self.X) * self.grad_dict['dL_dK'] #TODO could use gradients_X, instead of r.
dr_doff = self.dr_doffset(self.X,self.selected,self.offset.values) dr_doff = self.dr_doffset(self.X,self.selected,self.offset.values)
for i in range(len(dr_doff)): for i in range(len(dr_doff)):

View file

@ -5,12 +5,10 @@ import GPy
import numpy as np import numpy as np
import sys #so I can print dots import sys #so I can print dots
def get_log_likelihood(inputs,data,clust): def get_individual_log_likelihood_offset(inputs,data,clust,common_kern):
"""Get the LL of a combined set of clusters, ignoring time series offsets. """Get the LL of a pair of clusters, but having them independent
Get the log likelihood of a cluster without worrying about the fact Get the log likelihood of a cluster pair.
different time series are offset. We're using it here really for those
cases in which we only have one cluster to get the loglikelihood of.
arguments: arguments:
inputs -- the 'X's in a list, one item per cluster inputs -- the 'X's in a list, one item per cluster
@ -18,7 +16,7 @@ def get_log_likelihood(inputs,data,clust):
clust -- list of clusters to use clust -- list of clusters to use
returns a tuple: returns a tuple:
log likelihood and the offset (which is always zero for this model) log likelihood and the offset
""" """
S = data[0].shape[0] #number of time series S = data[0].shape[0] #number of time series
@ -29,23 +27,22 @@ def get_log_likelihood(inputs,data,clust):
#for each person in the cluster, #for each person in the cluster,
#add their inputs and data to the new dataset #add their inputs and data to the new dataset
for p in clust: idx = np.zeros([0,1])
for it,p in enumerate(clust):
X = np.vstack([X,inputs[p]]) X = np.vstack([X,inputs[p]])
Y = np.vstack([Y,data[p].T]) Y = np.vstack([Y,data[p].T])
idx = np.r_[idx,np.ones([data[p].shape[1],1])*it]
#find the loglikelihood. We just add together the LL for each time series.
#ll=0
#for s in range(S):
# m = GPy.models.GPRegression(X,Y[:,s][:,None])
# m.optimize()
# ll+=m.log_likelihood()
m = GPy.models.GPRegression(X,Y) X = np.c_[X,idx]
#print(X)
k_independent = GPy.kern.IndependentOutputs(common_kern.copy(),index_dim=1)
m = GPy.models.GPRegression(X,Y,k_independent)
m.optimize() m.optimize()
ll=m.log_likelihood() ll=m.log_likelihood()
return ll,0 return ll,0
def get_log_likelihood_offset(inputs,data,clust): def get_shared_log_likelihood_offset(inputs,data,clust,common_kern):
"""Get the log likelihood of a combined set of clusters, fitting the offsets """Get the log likelihood of a combined set of clusters, fitting the offsets
arguments: arguments:
@ -56,18 +53,13 @@ def get_log_likelihood_offset(inputs,data,clust):
returns a tuple: returns a tuple:
log likelihood and the offset log likelihood and the offset
""" """
#if we've only got one cluster, the model has an error, so we want to just
#use normal GPRegression.
if len(clust)==1:
return get_log_likelihood(inputs,data,clust)
S = data[0].shape[0] #number of time series S = data[0].shape[0] #number of time series
X = np.zeros([0,2]) #notice the extra column, this is for the cluster index X = np.zeros([0,2]) #notice the extra column, this is for the cluster index
Y = np.zeros([0,S]) Y = np.zeros([0,S])
#for each person in the cluster, add their inputs and data to the new #for each cluster, add their inputs and data to the new
#dataset. Note we add an index identifying which person is which data point. #dataset. Note we add an index identifying which person is which data point.
#This is for the offset model to use, to allow it to know which data points #This is for the offset model to use, to allow it to know which data points
#to shift. #to shift.
@ -75,10 +67,11 @@ def get_log_likelihood_offset(inputs,data,clust):
idx = i*np.ones([inputs[p].shape[0],1]) idx = i*np.ones([inputs[p].shape[0],1])
X = np.vstack([X,np.hstack([inputs[p],idx])]) X = np.vstack([X,np.hstack([inputs[p],idx])])
Y = np.vstack([Y,data[p].T]) Y = np.vstack([Y,data[p].T])
m = GPy.models.GPOffsetRegression(X,Y) m = GPy.models.GPOffsetRegression(X,Y,common_kern.copy())
#TODO: How to select a sensible prior? #TODO: How to select a sensible prior?
m.offset.set_prior(GPy.priors.Gaussian(0,20)) ## m.offset.set_prior(GPy.priors.Gaussian(0,20))
#TODO: Set a sensible start value for the length scale, #TODO: Set a sensible start value for the length scale,
#make it long to help the offset fit. #make it long to help the offset fit.
@ -88,7 +81,7 @@ def get_log_likelihood_offset(inputs,data,clust):
offset = m.offset.values[0] offset = m.offset.values[0]
return ll,offset return ll,offset
def cluster(data,inputs,verbose=False): def cluster(data,inputs,common_kern,verbose=False):
"""Clusters data """Clusters data
Using the new offset model, this method uses a greedy algorithm to cluster Using the new offset model, this method uses a greedy algorithm to cluster
@ -111,12 +104,11 @@ def cluster(data,inputs,verbose=False):
for p in range(0,N): for p in range(0,N):
active.append([p]) active.append([p])
loglikes = np.zeros(len(active)) individualloglikes = np.zeros([len(active),len(active)])
loglikes[:] = None individualloglikes[:] = None
sharedloglikes = np.zeros([len(active),len(active)])
pairloglikes = np.zeros([len(active),len(active)]) sharedloglikes[:] = None
pairloglikes[:] = None sharedoffset = np.zeros([len(active),len(active)])
pairoffset = np.zeros([len(active),len(active)])
it = 0 it = 0
while True: while True:
@ -125,23 +117,17 @@ def cluster(data,inputs,verbose=False):
it +=1 it +=1
print("Iteration %d" % it) print("Iteration %d" % it)
#Compute the log-likelihood of each cluster (add them together) #Compute the log-likelihood of each cluster
for clusti in range(len(active)): for clusti in range(len(active)):
if verbose:
sys.stdout.write('.')
sys.stdout.flush()
if np.isnan(loglikes[clusti]):
loglikes[clusti], unused_offset = get_log_likelihood_offset(inputs,data,[clusti])
#try combining with each other cluster... #try combining with each other cluster...
for clustj in range(clusti): #count from 0 to clustj-1 for clustj in range(clusti): #count from 0 to clustj-1
temp = [clusti,clustj] temp = [clusti,clustj]
if np.isnan(pairloglikes[clusti,clustj]): if np.isnan(sharedloglikes[clusti,clustj]):
pairloglikes[clusti,clustj],pairoffset[clusti,clustj] = get_log_likelihood_offset(inputs,data,temp) sharedloglikes[clusti,clustj],sharedoffset[clusti,clustj] = get_shared_log_likelihood_offset(inputs,data,temp,common_kern)
individualloglikes[clusti,clustj], unused_offset = get_individual_log_likelihood_offset(inputs,data,temp,common_kern)
seploglikes = np.repeat(loglikes[:,None].T,len(loglikes),0)+np.repeat(loglikes[:,None],len(loglikes),1) loglikeimprovement = sharedloglikes - individualloglikes #how much likelihood improves with clustering
loglikeimprovement = pairloglikes - seploglikes #how much likelihood improves with clustering top = np.unravel_index(np.nanargmax(sharedloglikes-individualloglikes), sharedloglikes.shape)
top = np.unravel_index(np.nanargmax(pairloglikes-seploglikes), pairloglikes.shape)
#if loglikeimprovement.shape[0]<3: #if loglikeimprovement.shape[0]<3:
# #no more clustering to do - this shouldn't happen really unless # #no more clustering to do - this shouldn't happen really unless
@ -149,9 +135,15 @@ def cluster(data,inputs,verbose=False):
# break # break
#if theres further clustering to be done... #if theres further clustering to be done...
# print("SHARED")
# print(sharedloglikes)
# print("IND")
# print(individualloglikes)
# print("----")
if loglikeimprovement[top[0],top[1]]>0: if loglikeimprovement[top[0],top[1]]>0:
#print(loglikeimprovement[top[0],top[1]])
active[top[0]].extend(active[top[1]]) active[top[0]].extend(active[top[1]])
offset=pairoffset[top[0],top[1]] offset=sharedoffset[top[0],top[1]]
inputs[top[0]] = np.vstack([inputs[top[0]],inputs[top[1]]-offset]) inputs[top[0]] = np.vstack([inputs[top[0]],inputs[top[1]]-offset])
data[top[0]] = np.hstack([data[top[0]],data[top[1]]]) data[top[0]] = np.hstack([data[top[0]],data[top[1]]])
del inputs[top[1]] del inputs[top[1]]
@ -159,12 +151,14 @@ def cluster(data,inputs,verbose=False):
del active[top[1]] del active[top[1]]
#None = message to say we need to recalculate #None = message to say we need to recalculate
pairloglikes[:,top[0]] = None sharedloglikes[:,top[0]] = None
pairloglikes[top[0],:] = None sharedloglikes[top[0],:] = None
pairloglikes = np.delete(pairloglikes,top[1],0) sharedloglikes = np.delete(sharedloglikes,top[1],0)
pairloglikes = np.delete(pairloglikes,top[1],1) sharedloglikes = np.delete(sharedloglikes,top[1],1)
loglikes[top[0]] = None individualloglikes[:,top[0]] = None
loglikes = np.delete(loglikes,top[1]) individualloglikes[top[0],:] = None
individualloglikes = np.delete(individualloglikes,top[1],0)
individualloglikes = np.delete(individualloglikes,top[1],1)
else: else:
break break