Fixed clustering maths: Previously was comparing models with unequal numbers of points and different parameters

This commit is contained in:
Michael T Smith 2016-09-06 10:24:46 +01:00
parent 532d7188e7
commit 84d2ffcedb
2 changed files with 47 additions and 53 deletions

View file

@ -86,7 +86,7 @@ class GPOffsetRegression(GP):
self.X = self.X_fixed - offsets[self.selected]
super(GPOffsetRegression, self).parameters_changed()
dL_dr = self.kern.dK_dr_via_X(self.X, self.X) * self.grad_dict['dL_dK']
dL_dr = self.kern.dK_dr_via_X(self.X, self.X) * self.grad_dict['dL_dK'] #TODO could use gradients_X, instead of r.
dr_doff = self.dr_doffset(self.X,self.selected,self.offset.values)
for i in range(len(dr_doff)):

View file

@ -5,12 +5,10 @@ import GPy
import numpy as np
import sys #so I can print dots
def get_log_likelihood(inputs,data,clust):
"""Get the LL of a combined set of clusters, ignoring time series offsets.
def get_individual_log_likelihood_offset(inputs,data,clust,common_kern):
"""Get the LL of a pair of clusters, but having them independent
Get the log likelihood of a cluster without worrying about the fact
different time series are offset. We're using it here really for those
cases in which we only have one cluster to get the loglikelihood of.
Get the log likelihood of a cluster pair.
arguments:
inputs -- the 'X's in a list, one item per cluster
@ -18,7 +16,7 @@ def get_log_likelihood(inputs,data,clust):
clust -- list of clusters to use
returns a tuple:
log likelihood and the offset (which is always zero for this model)
log likelihood and the offset
"""
S = data[0].shape[0] #number of time series
@ -29,23 +27,22 @@ def get_log_likelihood(inputs,data,clust):
#for each person in the cluster,
#add their inputs and data to the new dataset
for p in clust:
idx = np.zeros([0,1])
for it,p in enumerate(clust):
X = np.vstack([X,inputs[p]])
Y = np.vstack([Y,data[p].T])
idx = np.r_[idx,np.ones([data[p].shape[1],1])*it]
#find the loglikelihood. We just add together the LL for each time series.
#ll=0
#for s in range(S):
# m = GPy.models.GPRegression(X,Y[:,s][:,None])
# m.optimize()
# ll+=m.log_likelihood()
X = np.c_[X,idx]
#print(X)
k_independent = GPy.kern.IndependentOutputs(common_kern.copy(),index_dim=1)
m = GPy.models.GPRegression(X,Y)
m = GPy.models.GPRegression(X,Y,k_independent)
m.optimize()
ll=m.log_likelihood()
return ll,0
def get_log_likelihood_offset(inputs,data,clust):
def get_shared_log_likelihood_offset(inputs,data,clust,common_kern):
"""Get the log likelihood of a combined set of clusters, fitting the offsets
arguments:
@ -57,17 +54,12 @@ def get_log_likelihood_offset(inputs,data,clust):
log likelihood and the offset
"""
#if we've only got one cluster, the model has an error, so we want to just
#use normal GPRegression.
if len(clust)==1:
return get_log_likelihood(inputs,data,clust)
S = data[0].shape[0] #number of time series
X = np.zeros([0,2]) #notice the extra column, this is for the cluster index
Y = np.zeros([0,S])
#for each person in the cluster, add their inputs and data to the new
#for each cluster, add their inputs and data to the new
#dataset. Note we add an index identifying which person is which data point.
#This is for the offset model to use, to allow it to know which data points
#to shift.
@ -76,9 +68,10 @@ def get_log_likelihood_offset(inputs,data,clust):
X = np.vstack([X,np.hstack([inputs[p],idx])])
Y = np.vstack([Y,data[p].T])
m = GPy.models.GPOffsetRegression(X,Y)
m = GPy.models.GPOffsetRegression(X,Y,common_kern.copy())
#TODO: How to select a sensible prior?
m.offset.set_prior(GPy.priors.Gaussian(0,20))
## m.offset.set_prior(GPy.priors.Gaussian(0,20))
#TODO: Set a sensible start value for the length scale,
#make it long to help the offset fit.
@ -88,7 +81,7 @@ def get_log_likelihood_offset(inputs,data,clust):
offset = m.offset.values[0]
return ll,offset
def cluster(data,inputs,verbose=False):
def cluster(data,inputs,common_kern,verbose=False):
"""Clusters data
Using the new offset model, this method uses a greedy algorithm to cluster
@ -111,12 +104,11 @@ def cluster(data,inputs,verbose=False):
for p in range(0,N):
active.append([p])
loglikes = np.zeros(len(active))
loglikes[:] = None
pairloglikes = np.zeros([len(active),len(active)])
pairloglikes[:] = None
pairoffset = np.zeros([len(active),len(active)])
individualloglikes = np.zeros([len(active),len(active)])
individualloglikes[:] = None
sharedloglikes = np.zeros([len(active),len(active)])
sharedloglikes[:] = None
sharedoffset = np.zeros([len(active),len(active)])
it = 0
while True:
@ -125,23 +117,17 @@ def cluster(data,inputs,verbose=False):
it +=1
print("Iteration %d" % it)
#Compute the log-likelihood of each cluster (add them together)
#Compute the log-likelihood of each cluster
for clusti in range(len(active)):
if verbose:
sys.stdout.write('.')
sys.stdout.flush()
if np.isnan(loglikes[clusti]):
loglikes[clusti], unused_offset = get_log_likelihood_offset(inputs,data,[clusti])
#try combining with each other cluster...
for clustj in range(clusti): #count from 0 to clustj-1
temp = [clusti,clustj]
if np.isnan(pairloglikes[clusti,clustj]):
pairloglikes[clusti,clustj],pairoffset[clusti,clustj] = get_log_likelihood_offset(inputs,data,temp)
if np.isnan(sharedloglikes[clusti,clustj]):
sharedloglikes[clusti,clustj],sharedoffset[clusti,clustj] = get_shared_log_likelihood_offset(inputs,data,temp,common_kern)
individualloglikes[clusti,clustj], unused_offset = get_individual_log_likelihood_offset(inputs,data,temp,common_kern)
seploglikes = np.repeat(loglikes[:,None].T,len(loglikes),0)+np.repeat(loglikes[:,None],len(loglikes),1)
loglikeimprovement = pairloglikes - seploglikes #how much likelihood improves with clustering
top = np.unravel_index(np.nanargmax(pairloglikes-seploglikes), pairloglikes.shape)
loglikeimprovement = sharedloglikes - individualloglikes #how much likelihood improves with clustering
top = np.unravel_index(np.nanargmax(sharedloglikes-individualloglikes), sharedloglikes.shape)
#if loglikeimprovement.shape[0]<3:
# #no more clustering to do - this shouldn't happen really unless
@ -149,9 +135,15 @@ def cluster(data,inputs,verbose=False):
# break
#if theres further clustering to be done...
# print("SHARED")
# print(sharedloglikes)
# print("IND")
# print(individualloglikes)
# print("----")
if loglikeimprovement[top[0],top[1]]>0:
#print(loglikeimprovement[top[0],top[1]])
active[top[0]].extend(active[top[1]])
offset=pairoffset[top[0],top[1]]
offset=sharedoffset[top[0],top[1]]
inputs[top[0]] = np.vstack([inputs[top[0]],inputs[top[1]]-offset])
data[top[0]] = np.hstack([data[top[0]],data[top[1]]])
del inputs[top[1]]
@ -159,12 +151,14 @@ def cluster(data,inputs,verbose=False):
del active[top[1]]
#None = message to say we need to recalculate
pairloglikes[:,top[0]] = None
pairloglikes[top[0],:] = None
pairloglikes = np.delete(pairloglikes,top[1],0)
pairloglikes = np.delete(pairloglikes,top[1],1)
loglikes[top[0]] = None
loglikes = np.delete(loglikes,top[1])
sharedloglikes[:,top[0]] = None
sharedloglikes[top[0],:] = None
sharedloglikes = np.delete(sharedloglikes,top[1],0)
sharedloglikes = np.delete(sharedloglikes,top[1],1)
individualloglikes[:,top[0]] = None
individualloglikes[top[0],:] = None
individualloglikes = np.delete(individualloglikes,top[1],0)
individualloglikes = np.delete(individualloglikes,top[1],1)
else:
break