diff --git a/GPy/models/gp_offset_regression.py b/GPy/models/gp_offset_regression.py index b843e08d..f7a595cf 100644 --- a/GPy/models/gp_offset_regression.py +++ b/GPy/models/gp_offset_regression.py @@ -86,7 +86,7 @@ class GPOffsetRegression(GP): self.X = self.X_fixed - offsets[self.selected] super(GPOffsetRegression, self).parameters_changed() - dL_dr = self.kern.dK_dr_via_X(self.X, self.X) * self.grad_dict['dL_dK'] + dL_dr = self.kern.dK_dr_via_X(self.X, self.X) * self.grad_dict['dL_dK'] #TODO could use gradients_X, instead of r. dr_doff = self.dr_doffset(self.X,self.selected,self.offset.values) for i in range(len(dr_doff)): diff --git a/GPy/util/cluster_with_offset.py b/GPy/util/cluster_with_offset.py index d1e896f1..29f1e9ff 100644 --- a/GPy/util/cluster_with_offset.py +++ b/GPy/util/cluster_with_offset.py @@ -5,12 +5,10 @@ import GPy import numpy as np import sys #so I can print dots -def get_log_likelihood(inputs,data,clust): - """Get the LL of a combined set of clusters, ignoring time series offsets. +def get_individual_log_likelihood_offset(inputs,data,clust,common_kern): + """Get the LL of a pair of clusters, but having them independent - Get the log likelihood of a cluster without worrying about the fact - different time series are offset. We're using it here really for those - cases in which we only have one cluster to get the loglikelihood of. + Get the log likelihood of a cluster pair. arguments: inputs -- the 'X's in a list, one item per cluster @@ -18,7 +16,7 @@ def get_log_likelihood(inputs,data,clust): clust -- list of clusters to use returns a tuple: - log likelihood and the offset (which is always zero for this model) + log likelihood and the offset """ S = data[0].shape[0] #number of time series @@ -29,23 +27,22 @@ def get_log_likelihood(inputs,data,clust): #for each person in the cluster, #add their inputs and data to the new dataset - for p in clust: + idx = np.zeros([0,1]) + for it,p in enumerate(clust): X = np.vstack([X,inputs[p]]) Y = np.vstack([Y,data[p].T]) - - #find the loglikelihood. We just add together the LL for each time series. - #ll=0 - #for s in range(S): - # m = GPy.models.GPRegression(X,Y[:,s][:,None]) - # m.optimize() - # ll+=m.log_likelihood() + idx = np.r_[idx,np.ones([data[p].shape[1],1])*it] - m = GPy.models.GPRegression(X,Y) + X = np.c_[X,idx] + #print(X) + k_independent = GPy.kern.IndependentOutputs(common_kern.copy(),index_dim=1) + + m = GPy.models.GPRegression(X,Y,k_independent) m.optimize() ll=m.log_likelihood() return ll,0 -def get_log_likelihood_offset(inputs,data,clust): +def get_shared_log_likelihood_offset(inputs,data,clust,common_kern): """Get the log likelihood of a combined set of clusters, fitting the offsets arguments: @@ -56,18 +53,13 @@ def get_log_likelihood_offset(inputs,data,clust): returns a tuple: log likelihood and the offset """ - - #if we've only got one cluster, the model has an error, so we want to just - #use normal GPRegression. - if len(clust)==1: - return get_log_likelihood(inputs,data,clust) - + S = data[0].shape[0] #number of time series X = np.zeros([0,2]) #notice the extra column, this is for the cluster index Y = np.zeros([0,S]) - #for each person in the cluster, add their inputs and data to the new + #for each cluster, add their inputs and data to the new #dataset. Note we add an index identifying which person is which data point. #This is for the offset model to use, to allow it to know which data points #to shift. @@ -75,10 +67,11 @@ def get_log_likelihood_offset(inputs,data,clust): idx = i*np.ones([inputs[p].shape[0],1]) X = np.vstack([X,np.hstack([inputs[p],idx])]) Y = np.vstack([Y,data[p].T]) - - m = GPy.models.GPOffsetRegression(X,Y) + + m = GPy.models.GPOffsetRegression(X,Y,common_kern.copy()) + #TODO: How to select a sensible prior? - m.offset.set_prior(GPy.priors.Gaussian(0,20)) + ## m.offset.set_prior(GPy.priors.Gaussian(0,20)) #TODO: Set a sensible start value for the length scale, #make it long to help the offset fit. @@ -88,7 +81,7 @@ def get_log_likelihood_offset(inputs,data,clust): offset = m.offset.values[0] return ll,offset -def cluster(data,inputs,verbose=False): +def cluster(data,inputs,common_kern,verbose=False): """Clusters data Using the new offset model, this method uses a greedy algorithm to cluster @@ -111,12 +104,11 @@ def cluster(data,inputs,verbose=False): for p in range(0,N): active.append([p]) - loglikes = np.zeros(len(active)) - loglikes[:] = None - - pairloglikes = np.zeros([len(active),len(active)]) - pairloglikes[:] = None - pairoffset = np.zeros([len(active),len(active)]) + individualloglikes = np.zeros([len(active),len(active)]) + individualloglikes[:] = None + sharedloglikes = np.zeros([len(active),len(active)]) + sharedloglikes[:] = None + sharedoffset = np.zeros([len(active),len(active)]) it = 0 while True: @@ -125,23 +117,17 @@ def cluster(data,inputs,verbose=False): it +=1 print("Iteration %d" % it) - #Compute the log-likelihood of each cluster (add them together) + #Compute the log-likelihood of each cluster for clusti in range(len(active)): - if verbose: - sys.stdout.write('.') - sys.stdout.flush() - if np.isnan(loglikes[clusti]): - loglikes[clusti], unused_offset = get_log_likelihood_offset(inputs,data,[clusti]) - #try combining with each other cluster... for clustj in range(clusti): #count from 0 to clustj-1 temp = [clusti,clustj] - if np.isnan(pairloglikes[clusti,clustj]): - pairloglikes[clusti,clustj],pairoffset[clusti,clustj] = get_log_likelihood_offset(inputs,data,temp) + if np.isnan(sharedloglikes[clusti,clustj]): + sharedloglikes[clusti,clustj],sharedoffset[clusti,clustj] = get_shared_log_likelihood_offset(inputs,data,temp,common_kern) + individualloglikes[clusti,clustj], unused_offset = get_individual_log_likelihood_offset(inputs,data,temp,common_kern) - seploglikes = np.repeat(loglikes[:,None].T,len(loglikes),0)+np.repeat(loglikes[:,None],len(loglikes),1) - loglikeimprovement = pairloglikes - seploglikes #how much likelihood improves with clustering - top = np.unravel_index(np.nanargmax(pairloglikes-seploglikes), pairloglikes.shape) + loglikeimprovement = sharedloglikes - individualloglikes #how much likelihood improves with clustering + top = np.unravel_index(np.nanargmax(sharedloglikes-individualloglikes), sharedloglikes.shape) #if loglikeimprovement.shape[0]<3: # #no more clustering to do - this shouldn't happen really unless @@ -149,9 +135,15 @@ def cluster(data,inputs,verbose=False): # break #if theres further clustering to be done... + # print("SHARED") + # print(sharedloglikes) + # print("IND") + # print(individualloglikes) + # print("----") if loglikeimprovement[top[0],top[1]]>0: + #print(loglikeimprovement[top[0],top[1]]) active[top[0]].extend(active[top[1]]) - offset=pairoffset[top[0],top[1]] + offset=sharedoffset[top[0],top[1]] inputs[top[0]] = np.vstack([inputs[top[0]],inputs[top[1]]-offset]) data[top[0]] = np.hstack([data[top[0]],data[top[1]]]) del inputs[top[1]] @@ -159,12 +151,14 @@ def cluster(data,inputs,verbose=False): del active[top[1]] #None = message to say we need to recalculate - pairloglikes[:,top[0]] = None - pairloglikes[top[0],:] = None - pairloglikes = np.delete(pairloglikes,top[1],0) - pairloglikes = np.delete(pairloglikes,top[1],1) - loglikes[top[0]] = None - loglikes = np.delete(loglikes,top[1]) + sharedloglikes[:,top[0]] = None + sharedloglikes[top[0],:] = None + sharedloglikes = np.delete(sharedloglikes,top[1],0) + sharedloglikes = np.delete(sharedloglikes,top[1],1) + individualloglikes[:,top[0]] = None + individualloglikes[top[0],:] = None + individualloglikes = np.delete(individualloglikes,top[1],0) + individualloglikes = np.delete(individualloglikes,top[1],1) else: break