From c701fb9f093f47e319b37ae3c34d729c6c7b2437 Mon Sep 17 00:00:00 2001 From: Michael T Smith Date: Wed, 3 Aug 2016 14:55:23 +0100 Subject: [PATCH 01/15] Adding refs to new clustering and offset model and utility --- GPy/models/__init__.py | 2 ++ GPy/util/__init__.py | 1 + 2 files changed, 3 insertions(+) diff --git a/GPy/models/__init__.py b/GPy/models/__init__.py index 654b1938..6d19d8a5 100644 --- a/GPy/models/__init__.py +++ b/GPy/models/__init__.py @@ -26,3 +26,5 @@ from .dpgplvm import DPBayesianGPLVM from .state_space_model import StateSpace from .ibp_lfm import IBPLFM + +from .gp_offset_regression import GPOffsetRegression diff --git a/GPy/util/__init__.py b/GPy/util/__init__.py index fb1eb0d6..685551fd 100644 --- a/GPy/util/__init__.py +++ b/GPy/util/__init__.py @@ -16,3 +16,4 @@ from . import initialization from . import multioutput from . import parallel from . import functions +from . import cluster_with_offset From 132c8d92b2badabeac3e6167f433e9bfb726641c Mon Sep 17 00:00:00 2001 From: Michael T Smith Date: Wed, 3 Aug 2016 15:27:49 +0100 Subject: [PATCH 02/15] Offset model and clustering utility --- GPy/models/gp_offset_regression.py | 92 +++++++++++++++ GPy/util/cluster_with_offset.py | 180 +++++++++++++++++++++++++++++ 2 files changed, 272 insertions(+) create mode 100644 GPy/models/gp_offset_regression.py create mode 100644 GPy/util/cluster_with_offset.py diff --git a/GPy/models/gp_offset_regression.py b/GPy/models/gp_offset_regression.py new file mode 100644 index 00000000..3699223e --- /dev/null +++ b/GPy/models/gp_offset_regression.py @@ -0,0 +1,92 @@ +# Copyright (c) 2012 - 2014 the GPy Austhors (see AUTHORS.txt) +# Licensed under the BSD 3-clause license (see LICENSE.txt) +# Written by Mike Smith. michaeltsmith.org.uk + +import numpy as np +from ..core import GP +from .. import likelihoods +from .. 
import kern
from ..core import Param


class GPOffsetRegression(GP):
    """Gaussian process regression in which each cluster of points shares a
    learnt offset along the input axis.

    The last column of X is an integer cluster label; the remaining
    column(s) are the actual inputs.  One offset hyperparameter is learnt
    per cluster (cluster 0 is the reference and is pinned at zero offset),
    and every point in cluster c is shifted by -offset[c] before the
    kernel is evaluated.

    :param X: input observations; the last column must be the cluster
        label, so for one real input dimension X is Nx2
    :param Y: observed values (Nx1, or NxD for multiple independent outputs)
    :param kernel: a GPy kernel, defaults to RBF
    :param Norm normalizer: [False].
        Normalize Y with the norm given.  If normalizer is False, no
        normalization will be done.  If it is None, we use
        GaussianNorm(alization).
    :param noise_var: the noise variance for the Gaussian likelihood, defaults to 1.

    .. Note:: Multiple independent outputs are allowed using columns of Y
    """

    def __init__(self, X, Y, kernel=None, Y_metadata=None, normalizer=None,
                 noise_var=1., mean_function=None):
        assert X.shape[1] > 1, "Need at least two input dimensions, as last dimension is the label of the cluster"
        if kernel is None:
            # One column of X is the cluster label, not a kernel input.
            kernel = kern.RBF(X.shape[1] - 1)

        likelihood = likelihoods.Gaussian(variance=noise_var)
        self.X_fixed = X[:, :-1]                # inputs with the label column removed
        self.selected = X[:, -1].astype(int)    # cluster label of each data point

        super(GPOffsetRegression, self).__init__(
            X, Y, kernel, likelihood, name='GP offset regression',
            Y_metadata=Y_metadata, normalizer=normalizer,
            mean_function=mean_function)

        # One offset per cluster except cluster 0, which all others are
        # measured relative to.
        maxcluster = np.max(self.selected)
        self.offset = Param('offset', np.zeros(maxcluster))
        # TODO: consider self.offset.set_prior(...) with a sensible default
        self.link_parameter(self.offset)

    def dr_doffset(self, X, sel, delta):
        """Derivative of the kernel's scaled distance r w.r.t. each offset.

        For offset k, entry [i, j] of the returned matrix answers: how does
        r(X_i, X_j) change when offset k grows?  Only pairs where exactly
        one of the two points belongs to cluster k+1 are affected, and the
        sign depends on which point of the pair lies to the right.

        arguments:
        X -- the (already offset-shifted) Nx1 input matrix
        sel -- length-N array of cluster labels
        delta -- the current offset values, one per non-reference cluster

        returns a list of NxN arrays, one per offset.
        """
        # w[i, j] = X_i - X_j (X is Nx1, so broadcasting gives NxN).
        w = X - X.T
        dr_doffsets = []
        for k in range(len(delta)):
            inclust = (sel == (k + 1)).astype(float)
            # G[i, j] = [sel_i == k+1] - [sel_j == k+1]: +/-1 where exactly
            # one point of the pair is in cluster k+1, 0 elsewhere.
            G = inclust[:, None] - inclust[None, :]
            # d(X_i - X_j)/doffset_k has magnitude |G| and flips sign with
            # the sign of (X_i - X_j).  Divide by the lengthscale because
            # the kernel's r is |X_i - X_j| / lengthscale.  (The previous
            # version divided the whole *list* of matrices at the end,
            # which only worked via numpy silently coercing it to a 3-D
            # array; divide each matrix explicitly instead.)
            dr_doffsets.append(np.sign(w) * G / self.kern.lengthscale)
        return dr_doffsets

    def parameters_changed(self):
        """Shift X by the current offsets, then add the offset gradients."""
        # Prepend the fixed zero offset of the reference cluster 0.
        offsets = np.hstack([0.0, self.offset.values])[:, None]
        self.X = self.X_fixed - offsets[self.selected]
        super(GPOffsetRegression, self).parameters_changed()

        # Chain rule: dL/doffset_k = sum_ij dL/dK_ij * dK/dr_ij * dr_ij/doffset_k
        dL_dr = self.kern.dK_dr_via_X(self.X, self.X) * self.grad_dict['dL_dK']
        dr_doff = self.dr_doffset(self.X, self.selected, self.offset.values)
        for k, drdo in enumerate(dr_doff):
            # Minus sign: X = X_fixed - offset, so d(X)/d(offset) = -1.
            self.offset.gradient[k] = -np.sum(dL_dr * drdo)
# Copyright (c) 2016, Mike Smith
# Licensed under the BSD 3-clause license (see LICENSE.txt)

import sys  # so we can print progress dots

import numpy as np

import GPy


def _stack_cluster_data(inputs, data, clust, add_index):
    """Stack the inputs/data of the clusters listed in ``clust`` into one dataset.

    arguments:
    inputs -- the 'X's in a list, one item per cluster
    data -- the 'Y's in a list, one item per cluster
    clust -- list of cluster indices to combine
    add_index -- if True, append an extra input column identifying which
        cluster each row came from (the offset model needs this to know
        which data points to shift)

    returns (X, Y) as two stacked arrays.
    """
    S = data[0].shape[0]  # number of time series per data point
    X = np.zeros([0, 2 if add_index else 1])
    Y = np.zeros([0, S])
    for i, p in enumerate(clust):
        xp = inputs[p]
        if add_index:
            idx = i * np.ones([xp.shape[0], 1])
            xp = np.hstack([xp, idx])
        X = np.vstack([X, xp])
        Y = np.vstack([Y, data[p].T])
    return X, Y


def get_log_likelihood(inputs, data, clust):
    """Get the LL of a combined set of clusters, ignoring time series offsets.

    Get the log likelihood of a cluster without worrying about the fact
    different time series are offset.  We're using it here really for those
    cases in which we only have one cluster to get the loglikelihood of.

    arguments:
    inputs -- the 'X's in a list, one item per cluster
    data -- the 'Y's in a list, one item per cluster
    clust -- list of clusters to use

    returns a tuple:
    log likelihood and the offset (which is always zero for this model)
    """
    X, Y = _stack_cluster_data(inputs, data, clust, add_index=False)
    m = GPy.models.GPRegression(X, Y)
    m.optimize()
    return m.log_likelihood(), 0


def get_log_likelihood_offset(inputs, data, clust):
    """Get the log likelihood of a combined set of clusters, fitting the offsets.

    arguments:
    inputs -- the 'X's in a list, one item per cluster
    data -- the 'Y's in a list, one item per cluster
    clust -- list of clusters to use

    returns a tuple:
    log likelihood and the offset
    """
    # With a single cluster the offset model has no offset parameters and
    # errors, so fall back to plain GP regression.
    if len(clust) == 1:
        return get_log_likelihood(inputs, data, clust)

    X, Y = _stack_cluster_data(inputs, data, clust, add_index=True)
    m = GPy.models.GPOffsetRegression(X, Y)
    # TODO: How to select a sensible prior?
    m.offset.set_prior(GPy.priors.Gaussian(0, 20))
    # TODO: Set a sensible start value for the length scale;
    # make it long to help the offset fit.
    m.optimize()

    # NOTE(review): only the first offset is returned, so with more than
    # two clusters the remaining fitted offsets are discarded -- confirm
    # this is intended (the greedy cluster() below only ever joins pairs).
    return m.log_likelihood(), m.offset.values[0]


def cluster(data, inputs, verbose=False):
    """Clusters data.

    Using the new offset model, this method uses a greedy algorithm to
    cluster the data.  It starts with all the data points in separate
    clusters and tests whether combining them increases the overall
    log-likelihood (LL).  It then iteratively joins pairs of clusters which
    cause the greatest increase in the LL, until no join increases the LL.

    Note: ``inputs`` and ``data`` are modified in place as clusters merge.

    arguments:
    inputs -- the 'X's in a list, one item per cluster
    data -- the 'Y's in a list, one item per cluster
    verbose -- print per-iteration progress (default False)

    returns a list of the clusters (each a list of original indices).
    """
    N = len(data)

    # Start with every time series in its own cluster.
    active = [[p] for p in range(N)]

    # NaN marks entries that need (re)computing.
    loglikes = np.full(len(active), np.nan)
    pairloglikes = np.full([len(active), len(active)], np.nan)
    pairoffset = np.zeros([len(active), len(active)])

    it = 0
    while True:
        it += 1
        if verbose:
            print("Iteration %d" % it)

        # LL of each cluster alone, and of each pair combined (lower
        # triangle only; the pair LL is symmetric).
        for clusti in range(len(active)):
            if verbose:
                sys.stdout.write('.')
                sys.stdout.flush()
            if np.isnan(loglikes[clusti]):
                loglikes[clusti], _ = get_log_likelihood_offset(inputs, data, [clusti])
            for clustj in range(clusti):  # count from 0 to clusti-1
                if np.isnan(pairloglikes[clusti, clustj]):
                    pairloglikes[clusti, clustj], pairoffset[clusti, clustj] = \
                        get_log_likelihood_offset(inputs, data, [clusti, clustj])

        # How much the LL improves by joining each pair vs leaving them apart.
        seploglikes = loglikes[None, :] + loglikes[:, None]
        loglikeimprovement = pairloglikes - seploglikes
        top = np.unravel_index(np.nanargmax(loglikeimprovement), pairloglikes.shape)

        # No join improves the likelihood: we're done.
        if loglikeimprovement[top[0], top[1]] <= 0:
            break

        # Merge cluster top[1] into top[0], shifting its inputs by the
        # fitted offset.
        active[top[0]].extend(active[top[1]])
        offset = pairoffset[top[0], top[1]]
        inputs[top[0]] = np.vstack([inputs[top[0]], inputs[top[1]] - offset])
        data[top[0]] = np.hstack([data[top[0]], data[top[1]]])
        del inputs[top[1]]
        del data[top[1]]
        del active[top[1]]

        # NaN = message to say we need to recalculate anything involving
        # the merged cluster; then drop row/column top[1] from BOTH
        # matrices.  (Bug fix: previously pairoffset was never shrunk, so
        # after the first merge its indices no longer lined up with
        # pairloglikes and stale offsets could be applied to later merges.)
        pairloglikes[:, top[0]] = np.nan
        pairloglikes[top[0], :] = np.nan
        pairloglikes = np.delete(pairloglikes, top[1], 0)
        pairloglikes = np.delete(pairloglikes, top[1], 1)
        pairoffset = np.delete(pairoffset, top[1], 0)
        pairoffset = np.delete(pairoffset, top[1], 1)
        loglikes[top[0]] = np.nan
        loglikes = np.delete(loglikes, top[1])

    # TODO Add a way to return the offsets applied to all the time series
    return active
Added test code for new model and util --- GPy/testing/model_tests.py | 18 ++++++++++++ GPy/testing/util_tests.py | 49 +++++++++++++++++++++++++++++++++ GPy/util/cluster_with_offset.py | 2 +- 3 files changed, 68 insertions(+), 1 deletion(-) diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py index e4411e23..ff137299 100644 --- a/GPy/testing/model_tests.py +++ b/GPy/testing/model_tests.py @@ -405,6 +405,24 @@ class MiscTests(unittest.TestCase): warp_m.plot() warp_f.plot(X.min()-10, X.max()+10) plt.show() + + def test_offset_regression(self): + #Tests GPy.models.GPOffsetRegression. Using two small time series + #from a sine wave, we confirm the algorithm determines that the + #likelihood is maximised when the offset hyperparameter is approximately + #equal to the actual offset in X between the two time series. + offset = 3 + X1 = np.arange(0,50,5.0)[:,None] + X2 = np.arange(0+offset,50+offset,5.0)[:,None] + X = np.vstack([X1,X2]) + ind = np.vstack([np.zeros([10,1]),np.ones([10,1])]) + X = np.hstack([X,ind]) + Y = np.sin((X[0:10,0])/30.0)[:,None] + Y = np.vstack([Y,Y]) + + m = GPy.models.GPOffsetRegression(X,Y) + m.optimize() + assert np.abs(m.offset[0]-offset)<0.1, "GPOffsetRegression model failing to estimate correct offset." diff --git a/GPy/testing/util_tests.py b/GPy/testing/util_tests.py index 3c6241f3..e82f7c1a 100644 --- a/GPy/testing/util_tests.py +++ b/GPy/testing/util_tests.py @@ -107,3 +107,52 @@ class TestDebug(unittest.TestCase): self.assertTrue(d[tuple(X[:,4])] == d[tuple(X[:,0])] == [0, 4, 5]) self.assertTrue(d[tuple(X[:,1])] == [1]) + def test_offset_cluster(self): + #Tests the GPy.util.cluster_with_offset.cluster utility with a small + #test data set. Not using random noise just in case it occasionally + #causes it not to cluster correctly. 
+ #groundtruth cluster identifiers are: [0,1,1,0] + + #data contains a list of the four sets of time series (3 per data point) + + data = [np.array([[ 2.18094245, 1.96529789, 2.00265523, 2.18218742, 2.06795428], + [ 1.62254829, 1.75748448, 1.83879347, 1.87531326, 1.52503496], + [ 1.54589609, 1.61607914, 2.00463192, 1.48771394, 1.63339218]]), + np.array([[ 2.86766106, 2.97953437, 2.91958876, 2.92510506, 3.03239241], + [ 2.57368423, 2.59954886, 3.10000395, 2.75806125, 2.89865704], + [ 2.58916318, 2.53698259, 2.63858411, 2.63102504, 2.51853901]]), + np.array([[ 2.77834168, 2.9618564 , 2.88482141, 3.24259745, 2.9716821 ], + [ 2.60675576, 2.67095624, 2.94824436, 2.80520631, 2.87247516], + [ 2.49543562, 2.5492281 , 2.6505866 , 2.65015308, 2.59738616]]), + np.array([[ 1.76783086, 2.21666738, 2.07939706, 1.9268263 , 2.23360121], + [ 1.94305547, 1.94648592, 2.1278921 , 2.09481457, 2.08575238], + [ 1.69336013, 1.72285186, 1.6339506 , 1.61212022, 1.39198698]])] + + #inputs contains their associated X values + + inputs = [np.array([[ 0. ], + [ 0.68040097], + [ 1.20316795], + [ 1.798749 ], + [ 2.14891733]]), np.array([[ 0. ], + [ 0.51910637], + [ 0.98259352], + [ 1.57442965], + [ 1.82515098]]), np.array([[ 0. ], + [ 0.66645478], + [ 1.59464591], + [ 1.69769551], + [ 1.80932752]]), np.array([[ 0. ], + [ 0.87512108], + [ 1.71881079], + [ 2.67162871], + [ 3.23761907]])] + + #try doing the clustering + active = GPy.util.cluster_with_offset.cluster(data,inputs) + + #check to see that the clustering has correctly clustered the time series. 
+ from sets import Set + clusters = Set([Set(cluster) for cluster in active]) + assert Set([1,2]) in clusters, "Offset Clustering algorithm failed" + assert Set([0,3]) in clusters, "Offset Clustering algoirthm failed" diff --git a/GPy/util/cluster_with_offset.py b/GPy/util/cluster_with_offset.py index 103bdaca..fa8f1be8 100644 --- a/GPy/util/cluster_with_offset.py +++ b/GPy/util/cluster_with_offset.py @@ -122,7 +122,7 @@ def cluster(data,inputs,verbose=False): if verbose: it +=1 - print "Iteration %d" % it + print("Iteration %d" % it) #Compute the log-likelihood of each cluster (add them together) for clusti in range(len(active)): From 16bc77a1edeeadeb377f794707e4752a4827630e Mon Sep 17 00:00:00 2001 From: Michael T Smith Date: Wed, 3 Aug 2016 18:12:56 +0100 Subject: [PATCH 04/15] More useful message from testing re offset estimate --- GPy/testing/model_tests.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py index ff137299..e2c05aec 100644 --- a/GPy/testing/model_tests.py +++ b/GPy/testing/model_tests.py @@ -421,8 +421,9 @@ class MiscTests(unittest.TestCase): Y = np.vstack([Y,Y]) m = GPy.models.GPOffsetRegression(X,Y) + assert m.checkgrad(), "Gradients of offset parameters don't match numerical approximations." m.optimize() - assert np.abs(m.offset[0]-offset)<0.1, "GPOffsetRegression model failing to estimate correct offset." 
+ assert np.abs(m.offset[0]-offset)<0.1, ("GPOffsetRegression model failing to estimate correct offset (value estimated = %0.2f instead of %0.2f)" % (m.offset[0], offset)) From 04bcf56de12ed0da215a12e4d4edb450ac216dea Mon Sep 17 00:00:00 2001 From: Michael T Smith Date: Wed, 3 Aug 2016 18:28:08 +0100 Subject: [PATCH 05/15] Added GPy import --- GPy/testing/util_tests.py | 1 + 1 file changed, 1 insertion(+) diff --git a/GPy/testing/util_tests.py b/GPy/testing/util_tests.py index e82f7c1a..ce0beaaa 100644 --- a/GPy/testing/util_tests.py +++ b/GPy/testing/util_tests.py @@ -29,6 +29,7 @@ #=============================================================================== import unittest, numpy as np +import GPy class TestDebug(unittest.TestCase): def test_checkFinite(self): From ebe4a496a65d27ea8cb0289e1008ffac17ab3c98 Mon Sep 17 00:00:00 2001 From: Michael T Smith Date: Thu, 4 Aug 2016 09:01:27 +0100 Subject: [PATCH 06/15] Corrected mistake in gradients: Was finding d(Xi-Xj)/dOffset instead of dr/dOffset. Fixed by scaling by kernel lengthscale. --- GPy/models/gp_offset_regression.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/GPy/models/gp_offset_regression.py b/GPy/models/gp_offset_regression.py index 3699223e..2fd264f4 100644 --- a/GPy/models/gp_offset_regression.py +++ b/GPy/models/gp_offset_regression.py @@ -75,6 +75,10 @@ class GPOffsetRegression(GP): #print Gs[i] #print w dr_doffsets.append(dr_doffset) + + #lastly we need to divide by the lengthscale: So far we've found d(X_i - X_j)/dOffsets + #we want dr/dOffsets. (X_i - X_j)/lengthscale = r + dr_doffsets /= self.kern.lengthscale return dr_doffsets def parameters_changed(self): From 97f5ca6b8439fca8f365e4d9e5f23717f5cd4624 Mon Sep 17 00:00:00 2001 From: Michael T Smith Date: Thu, 4 Aug 2016 09:26:40 +0100 Subject: [PATCH 07/15] Modified set code in test to work with python 2 and python 3. 
--- GPy/testing/util_tests.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/GPy/testing/util_tests.py b/GPy/testing/util_tests.py index ce0beaaa..ba3d7ddf 100644 --- a/GPy/testing/util_tests.py +++ b/GPy/testing/util_tests.py @@ -148,12 +148,10 @@ class TestDebug(unittest.TestCase): [ 1.71881079], [ 2.67162871], [ 3.23761907]])] - + #try doing the clustering active = GPy.util.cluster_with_offset.cluster(data,inputs) - #check to see that the clustering has correctly clustered the time series. - from sets import Set - clusters = Set([Set(cluster) for cluster in active]) - assert Set([1,2]) in clusters, "Offset Clustering algorithm failed" - assert Set([0,3]) in clusters, "Offset Clustering algoirthm failed" + clusters = set([frozenset(cluster) for cluster in active]) + assert set([1,2]) in clusters, "Offset Clustering algorithm failed" + assert set([0,3]) in clusters, "Offset Clustering algoirthm failed" From ad2779bdf3547b13656daed67b66284cb5652614 Mon Sep 17 00:00:00 2001 From: Michael T Smith Date: Thu, 4 Aug 2016 10:11:30 +0100 Subject: [PATCH 08/15] made initial lengthscale!=1 to ensure we're properly testing gradients --- GPy/testing/model_tests.py | 1 + 1 file changed, 1 insertion(+) diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py index e2c05aec..5d6de7ae 100644 --- a/GPy/testing/model_tests.py +++ b/GPy/testing/model_tests.py @@ -421,6 +421,7 @@ class MiscTests(unittest.TestCase): Y = np.vstack([Y,Y]) m = GPy.models.GPOffsetRegression(X,Y) + m.rbf.lengthscale=5.0 #make it something other than one to check our gradients properly! assert m.checkgrad(), "Gradients of offset parameters don't match numerical approximations." 
m.optimize() assert np.abs(m.offset[0]-offset)<0.1, ("GPOffsetRegression model failing to estimate correct offset (value estimated = %0.2f instead of %0.2f)" % (m.offset[0], offset)) From e019b17cf965a7f673326140767dbc48c4332e32 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 4 Aug 2016 12:31:36 +0000 Subject: [PATCH 09/15] Added threaded option - but this doesn't work due to the global interpreter lock --- GPy/util/__init__.py | 1 + GPy/util/cluster_with_offset.py | 4 + GPy/util/threaded_cluster_with_offset.py | 206 +++++++++++++++++++++++ 3 files changed, 211 insertions(+) create mode 100644 GPy/util/threaded_cluster_with_offset.py diff --git a/GPy/util/__init__.py b/GPy/util/__init__.py index 685551fd..136e0d3b 100644 --- a/GPy/util/__init__.py +++ b/GPy/util/__init__.py @@ -17,3 +17,4 @@ from . import multioutput from . import parallel from . import functions from . import cluster_with_offset +from . import threaded_cluster_with_offset diff --git a/GPy/util/cluster_with_offset.py b/GPy/util/cluster_with_offset.py index fa8f1be8..d1e896f1 100644 --- a/GPy/util/cluster_with_offset.py +++ b/GPy/util/cluster_with_offset.py @@ -3,6 +3,7 @@ import GPy import numpy as np +import sys #so I can print dots def get_log_likelihood(inputs,data,clust): """Get the LL of a combined set of clusters, ignoring time series offsets. 
@@ -126,6 +127,9 @@ def cluster(data,inputs,verbose=False): #Compute the log-likelihood of each cluster (add them together) for clusti in range(len(active)): + if verbose: + sys.stdout.write('.') + sys.stdout.flush() if np.isnan(loglikes[clusti]): loglikes[clusti], unused_offset = get_log_likelihood_offset(inputs,data,[clusti]) diff --git a/GPy/util/threaded_cluster_with_offset.py b/GPy/util/threaded_cluster_with_offset.py new file mode 100644 index 00000000..66192c9f --- /dev/null +++ b/GPy/util/threaded_cluster_with_offset.py @@ -0,0 +1,206 @@ +# Copyright (c) 2016, Mike Smith +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import GPy +import numpy as np +import time +import sys #so I can print dots +from threading import Thread +maxthreads = 40 + + +def get_log_likelihood(inputs,data,clust,loglikesarray): + """Get the LL of a combined set of clusters, ignoring time series offsets. + + Get the log likelihood of a cluster without worrying about the fact + different time series are offset. We're using it here really for those + cases in which we only have one cluster to get the loglikelihood of. + + arguments: + inputs -- the 'X's in a list, one item per cluster + data -- the 'Y's in a list, one item per cluster + clust -- list of clusters to use + + returns a tuple: + log likelihood and the offset (which is always zero for this model) + """ + + S = data[0].shape[0] #number of time series + + #build a new dataset from the clusters, by combining all clusters together + X = np.zeros([0,1]) + Y = np.zeros([0,S]) + + #for each person in the cluster, + #add their inputs and data to the new dataset + for p in clust: + X = np.vstack([X,inputs[p]]) + Y = np.vstack([Y,data[p].T]) + + #find the loglikelihood. We just add together the LL for each time series. 
+ #ll=0 + #for s in range(S): + # m = GPy.models.GPRegression(X,Y[:,s][:,None]) + # m.optimize() + # ll+=m.log_likelihood() + + m = GPy.models.GPRegression(X,Y) + m.optimize() + ll=m.log_likelihood() + + loglikesarray[clust[0]] = ll + return + +def get_log_likelihood_offset(inputs,data,clust,loglikesarray,offsetarray): + """Get the log likelihood of a combined set of clusters, fitting the offsets + + arguments: + inputs -- the 'X's in a list, one item per cluster + data -- the 'Y's in a list, one item per cluster + clust -- list of clusters to use + + returns a tuple: + log likelihood and the offset + """ + + assert len(clust)>1, "Use get_log_likelihood if you only have one cluster" + + + + S = data[0].shape[0] #number of time series + + X = np.zeros([0,2]) #notice the extra column, this is for the cluster index + Y = np.zeros([0,S]) + + #for each person in the cluster, add their inputs and data to the new + #dataset. Note we add an index identifying which person is which data point. + #This is for the offset model to use, to allow it to know which data points + #to shift. + for i,p in enumerate(clust): + idx = i*np.ones([inputs[p].shape[0],1]) + X = np.vstack([X,np.hstack([inputs[p],idx])]) + Y = np.vstack([Y,data[p].T]) + + m = GPy.models.GPOffsetRegression(X,Y) + #TODO: How to select a sensible prior? + m.offset.set_prior(GPy.priors.Gaussian(0,20)) + #TODO: Set a sensible start value for the length scale, + #make it long to help the offset fit. + + m.optimize() + + ll = m.log_likelihood() + offset = m.offset.values[0] + #return ll,offset + loglikesarray[clust[0],clust[1]] = ll + offsetarray[clust[0],clust[1]] = offset + return + +def cluster(data,inputs,verbose=False): + """Clusters data + + Using the new offset model, this method uses a greedy algorithm to cluster + the data. It starts with all the data points in separate clusters and tests + whether combining them increases the overall log-likelihood (LL). 
It then + iteratively joins pairs of clusters which cause the greatest increase in + the LL, until no join increases the LL. + + arguments: + inputs -- the 'X's in a list, one item per cluster + data -- the 'Y's in a list, one item per cluster + + returns a list of the clusters. + """ + N=len(data) + + + #Define a set of N active cluster + active = [] + for p in range(0,N): + active.append([p]) + + loglikes = np.zeros(len(active)) + loglikes[:] = None + + pairloglikes = np.zeros([len(active),len(active)]) + pairloglikes[:] = None + pairoffset = np.zeros([len(active),len(active)]) + + it = 0 + while True: + + if verbose: + it +=1 + print("Iteration %d" % it) + + threads = [] + #Compute the log-likelihood of each cluster (add them together) + for clusti in range(len(active)): + if verbose: + sys.stdout.write('.') + sys.stdout.flush() + if np.isnan(loglikes[clusti]): + t = Thread(target=get_log_likelihood,args=(inputs,data,[clusti],loglikes)) + threads.append(t) + #loglikes[clusti], unused_offset = get_log_likelihood_offset(inputs,data,[clusti]) + + #try combining with each other cluster... + for clustj in range(clusti): #count from 0 to clustj-1 + temp = [clusti,clustj] + if np.isnan(pairloglikes[clusti,clustj]): + #pairloglikes[clusti,clustj],pairoffset[clusti,clustj] = get_log_likelihood_offset(inputs,data,temp) + t = Thread(target=get_log_likelihood_offset,args=(inputs,data,temp,pairloglikes,pairoffset)) + threads.append(t) + + if len(threads)<=maxthreads: + for t in threads: + t.start() + for t in threads: + t.join() + + else: #should use a queue, as the method here assumes all threads take about same time, etc. 
+ for i,t in enumerate(threads): + t.start() + if (i>=maxthreads): + threads[i-maxthreads].join() + for i in range(len(threads)-maxthreads,len(threads)): + threads[i].join() + + seploglikes = np.repeat(loglikes[:,None].T,len(loglikes),0)+np.repeat(loglikes[:,None],len(loglikes),1) + loglikeimprovement = pairloglikes - seploglikes #how much likelihood improves with clustering + top = np.unravel_index(np.nanargmax(pairloglikes-seploglikes), pairloglikes.shape) + + #if loglikeimprovement.shape[0]<3: + # #no more clustering to do - this shouldn't happen really unless + # #we've set the threshold to apply clustering to less than 0 + # break + + #if theres further clustering to be done... + if loglikeimprovement[top[0],top[1]]>0: + active[top[0]].extend(active[top[1]]) + offset=pairoffset[top[0],top[1]] + inputs[top[0]] = np.vstack([inputs[top[0]],inputs[top[1]]-offset]) + data[top[0]] = np.hstack([data[top[0]],data[top[1]]]) + del inputs[top[1]] + del data[top[1]] + del active[top[1]] + + #None = message to say we need to recalculate + pairloglikes[:,top[0]] = None + pairloglikes[top[0],:] = None + pairloglikes = np.delete(pairloglikes,top[1],0) + pairloglikes = np.delete(pairloglikes,top[1],1) + loglikes[top[0]] = None + loglikes = np.delete(loglikes,top[1]) + else: + break + + #if loglikeimprovement[top[0],top[1]]>0: + # print "joined" + # print top + # print offset + # print offsets + # print offsets[top[1]]-offsets[top[0]] + + #TODO Add a way to return the offsets applied to all the time series + return active From 6f4d03178ebdc1967d5aeefbb1ca7ca5ab3923b3 Mon Sep 17 00:00:00 2001 From: Michael T Smith Date: Thu, 4 Aug 2016 14:47:08 +0100 Subject: [PATCH 10/15] Don't use message added to cluster code --- GPy/util/threaded_cluster_with_offset.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/GPy/util/threaded_cluster_with_offset.py b/GPy/util/threaded_cluster_with_offset.py index 66192c9f..6818f5ec 100644 --- a/GPy/util/threaded_cluster_with_offset.py +++ 
b/GPy/util/threaded_cluster_with_offset.py @@ -1,6 +1,9 @@ # Copyright (c) 2016, Mike Smith # Licensed under the BSD 3-clause license (see LICENSE.txt) +# Not recommended for use at the moment - the threading here doesn't +# work due to the global interpreter lock. + import GPy import numpy as np import time From 81dc680a8acb72b46f16bc12d599f847680a3a98 Mon Sep 17 00:00:00 2001 From: Michael T Smith Date: Tue, 9 Aug 2016 13:55:37 +0100 Subject: [PATCH 11/15] Push just to rerun testing --- GPy/models/gp_offset_regression.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/GPy/models/gp_offset_regression.py b/GPy/models/gp_offset_regression.py index 2fd264f4..b843e08d 100644 --- a/GPy/models/gp_offset_regression.py +++ b/GPy/models/gp_offset_regression.py @@ -49,8 +49,7 @@ class GPOffsetRegression(GP): #def dL_doffset(self, X, sel): # dL_dr = self.dK_dr_via_X(X, X) * dL_dK - - + def dr_doffset(self,X,sel,delta): #given an input matrix, X and the offsets (delta) From bd758ac849eff3f13f58c61b7a41f0fc1f1a395f Mon Sep 17 00:00:00 2001 From: Michael T Smith Date: Tue, 9 Aug 2016 14:54:29 +0100 Subject: [PATCH 12/15] Removing 'threaded' version --- GPy/util/__init__.py | 1 - GPy/util/threaded_cluster_with_offset.py | 209 ----------------------- 2 files changed, 210 deletions(-) delete mode 100644 GPy/util/threaded_cluster_with_offset.py diff --git a/GPy/util/__init__.py b/GPy/util/__init__.py index 136e0d3b..685551fd 100644 --- a/GPy/util/__init__.py +++ b/GPy/util/__init__.py @@ -17,4 +17,3 @@ from . import multioutput from . import parallel from . import functions from . import cluster_with_offset -from . 
import threaded_cluster_with_offset diff --git a/GPy/util/threaded_cluster_with_offset.py b/GPy/util/threaded_cluster_with_offset.py deleted file mode 100644 index 6818f5ec..00000000 --- a/GPy/util/threaded_cluster_with_offset.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright (c) 2016, Mike Smith -# Licensed under the BSD 3-clause license (see LICENSE.txt) - -# Not recommended for use at the moment - the threading here doesn't -# work due to the global interpreter lock. - -import GPy -import numpy as np -import time -import sys #so I can print dots -from threading import Thread -maxthreads = 40 - - -def get_log_likelihood(inputs,data,clust,loglikesarray): - """Get the LL of a combined set of clusters, ignoring time series offsets. - - Get the log likelihood of a cluster without worrying about the fact - different time series are offset. We're using it here really for those - cases in which we only have one cluster to get the loglikelihood of. - - arguments: - inputs -- the 'X's in a list, one item per cluster - data -- the 'Y's in a list, one item per cluster - clust -- list of clusters to use - - returns a tuple: - log likelihood and the offset (which is always zero for this model) - """ - - S = data[0].shape[0] #number of time series - - #build a new dataset from the clusters, by combining all clusters together - X = np.zeros([0,1]) - Y = np.zeros([0,S]) - - #for each person in the cluster, - #add their inputs and data to the new dataset - for p in clust: - X = np.vstack([X,inputs[p]]) - Y = np.vstack([Y,data[p].T]) - - #find the loglikelihood. We just add together the LL for each time series. 
- #ll=0 - #for s in range(S): - # m = GPy.models.GPRegression(X,Y[:,s][:,None]) - # m.optimize() - # ll+=m.log_likelihood() - - m = GPy.models.GPRegression(X,Y) - m.optimize() - ll=m.log_likelihood() - - loglikesarray[clust[0]] = ll - return - -def get_log_likelihood_offset(inputs,data,clust,loglikesarray,offsetarray): - """Get the log likelihood of a combined set of clusters, fitting the offsets - - arguments: - inputs -- the 'X's in a list, one item per cluster - data -- the 'Y's in a list, one item per cluster - clust -- list of clusters to use - - returns a tuple: - log likelihood and the offset - """ - - assert len(clust)>1, "Use get_log_likelihood if you only have one cluster" - - - - S = data[0].shape[0] #number of time series - - X = np.zeros([0,2]) #notice the extra column, this is for the cluster index - Y = np.zeros([0,S]) - - #for each person in the cluster, add their inputs and data to the new - #dataset. Note we add an index identifying which person is which data point. - #This is for the offset model to use, to allow it to know which data points - #to shift. - for i,p in enumerate(clust): - idx = i*np.ones([inputs[p].shape[0],1]) - X = np.vstack([X,np.hstack([inputs[p],idx])]) - Y = np.vstack([Y,data[p].T]) - - m = GPy.models.GPOffsetRegression(X,Y) - #TODO: How to select a sensible prior? - m.offset.set_prior(GPy.priors.Gaussian(0,20)) - #TODO: Set a sensible start value for the length scale, - #make it long to help the offset fit. - - m.optimize() - - ll = m.log_likelihood() - offset = m.offset.values[0] - #return ll,offset - loglikesarray[clust[0],clust[1]] = ll - offsetarray[clust[0],clust[1]] = offset - return - -def cluster(data,inputs,verbose=False): - """Clusters data - - Using the new offset model, this method uses a greedy algorithm to cluster - the data. It starts with all the data points in separate clusters and tests - whether combining them increases the overall log-likelihood (LL). 
It then - iteratively joins pairs of clusters which cause the greatest increase in - the LL, until no join increases the LL. - - arguments: - inputs -- the 'X's in a list, one item per cluster - data -- the 'Y's in a list, one item per cluster - - returns a list of the clusters. - """ - N=len(data) - - - #Define a set of N active cluster - active = [] - for p in range(0,N): - active.append([p]) - - loglikes = np.zeros(len(active)) - loglikes[:] = None - - pairloglikes = np.zeros([len(active),len(active)]) - pairloglikes[:] = None - pairoffset = np.zeros([len(active),len(active)]) - - it = 0 - while True: - - if verbose: - it +=1 - print("Iteration %d" % it) - - threads = [] - #Compute the log-likelihood of each cluster (add them together) - for clusti in range(len(active)): - if verbose: - sys.stdout.write('.') - sys.stdout.flush() - if np.isnan(loglikes[clusti]): - t = Thread(target=get_log_likelihood,args=(inputs,data,[clusti],loglikes)) - threads.append(t) - #loglikes[clusti], unused_offset = get_log_likelihood_offset(inputs,data,[clusti]) - - #try combining with each other cluster... - for clustj in range(clusti): #count from 0 to clustj-1 - temp = [clusti,clustj] - if np.isnan(pairloglikes[clusti,clustj]): - #pairloglikes[clusti,clustj],pairoffset[clusti,clustj] = get_log_likelihood_offset(inputs,data,temp) - t = Thread(target=get_log_likelihood_offset,args=(inputs,data,temp,pairloglikes,pairoffset)) - threads.append(t) - - if len(threads)<=maxthreads: - for t in threads: - t.start() - for t in threads: - t.join() - - else: #should use a queue, as the method here assumes all threads take about same time, etc. 
- for i,t in enumerate(threads): - t.start() - if (i>=maxthreads): - threads[i-maxthreads].join() - for i in range(len(threads)-maxthreads,len(threads)): - threads[i].join() - - seploglikes = np.repeat(loglikes[:,None].T,len(loglikes),0)+np.repeat(loglikes[:,None],len(loglikes),1) - loglikeimprovement = pairloglikes - seploglikes #how much likelihood improves with clustering - top = np.unravel_index(np.nanargmax(pairloglikes-seploglikes), pairloglikes.shape) - - #if loglikeimprovement.shape[0]<3: - # #no more clustering to do - this shouldn't happen really unless - # #we've set the threshold to apply clustering to less than 0 - # break - - #if theres further clustering to be done... - if loglikeimprovement[top[0],top[1]]>0: - active[top[0]].extend(active[top[1]]) - offset=pairoffset[top[0],top[1]] - inputs[top[0]] = np.vstack([inputs[top[0]],inputs[top[1]]-offset]) - data[top[0]] = np.hstack([data[top[0]],data[top[1]]]) - del inputs[top[1]] - del data[top[1]] - del active[top[1]] - - #None = message to say we need to recalculate - pairloglikes[:,top[0]] = None - pairloglikes[top[0],:] = None - pairloglikes = np.delete(pairloglikes,top[1],0) - pairloglikes = np.delete(pairloglikes,top[1],1) - loglikes[top[0]] = None - loglikes = np.delete(loglikes,top[1]) - else: - break - - #if loglikeimprovement[top[0],top[1]]>0: - # print "joined" - # print top - # print offset - # print offsets - # print offsets[top[1]]-offsets[top[0]] - - #TODO Add a way to return the offsets applied to all the time series - return active From 7135527fd00f0a385b65bfb32f9535c3859d1f8a Mon Sep 17 00:00:00 2001 From: mzwiessele Date: Tue, 16 Aug 2016 11:17:15 +0100 Subject: [PATCH 13/15] [Add] add kernel swallowed parts fix #412 --- GPy/kern/src/add.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/GPy/kern/src/add.py b/GPy/kern/src/add.py index 99fe809b..ca20e5a9 100644 --- a/GPy/kern/src/add.py +++ b/GPy/kern/src/add.py @@ -13,15 +13,21 @@ class 
Add(CombinationKernel): propagates gradients through. This kernel will take over the active dims of it's subkernels passed in. + + NOTE: The subkernels will be copies of the original kernels, to prevent + unexpected behavior. """ def __init__(self, subkerns, name='sum'): - for i, kern in enumerate(subkerns[:]): + _newkerns = [] + for kern in subkerns: if isinstance(kern, Add): - del subkerns[i] - for part in kern.parts[::-1]: + for part in kern.parts: #kern.unlink_parameter(part) - subkerns.insert(i, part.copy()) - super(Add, self).__init__(subkerns, name) + _newkerns.append(part.copy()) + else: + _newkerns.append(kern.copy()) + + super(Add, self).__init__(_newkerns, name) self._exact_psicomp = self._check_exact_psicomp() def _check_exact_psicomp(self): From 9aa19b662d7d2f4ca52df428b4b060a1d12425d5 Mon Sep 17 00:00:00 2001 From: mzwiessele Date: Tue, 16 Aug 2016 11:19:04 +0100 Subject: [PATCH 14/15] [readme] added deploy status to readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e0e715a9..513f2704 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ The Gaussian processes framework in Python. 
* Travis-CI [unit-tests](https://travis-ci.org/SheffieldML/GPy) * [![licence](https://img.shields.io/badge/licence-BSD-blue.svg)](http://opensource.org/licenses/BSD-3-Clause) -[![develstat](https://travis-ci.org/SheffieldML/GPy.svg?branch=devel)](https://travis-ci.org/SheffieldML/GPy) [![appveyor](https://ci.appveyor.com/api/projects/status/662o6tha09m2jix3/branch/deploy?svg=true)](https://ci.appveyor.com/project/mzwiessele/gpy/branch/deploy) [![coverallsdevel](https://coveralls.io/repos/github/SheffieldML/GPy/badge.svg?branch=devel)](https://coveralls.io/github/SheffieldML/GPy?branch=devel) [![covdevel](http://codecov.io/github/SheffieldML/GPy/coverage.svg?branch=devel)](http://codecov.io/github/SheffieldML/GPy?branch=devel) [![Research software impact](http://depsy.org/api/package/pypi/GPy/badge.svg)](http://depsy.org/package/python/GPy) [![Code Health](https://landscape.io/github/SheffieldML/GPy/devel/landscape.svg?style=flat)](https://landscape.io/github/SheffieldML/GPy/devel) +[![deploystat](https://travis-ci.org/SheffieldML/GPy.svg?branch=deploy)](https://travis-ci.org/SheffieldML/GPy) [![appveyor](https://ci.appveyor.com/api/projects/status/662o6tha09m2jix3/branch/deploy?svg=true)](https://ci.appveyor.com/project/mzwiessele/gpy/branch/deploy) [![coverallsdevel](https://coveralls.io/repos/github/SheffieldML/GPy/badge.svg?branch=devel)](https://coveralls.io/github/SheffieldML/GPy?branch=devel) [![covdevel](http://codecov.io/github/SheffieldML/GPy/coverage.svg?branch=devel)](http://codecov.io/github/SheffieldML/GPy?branch=devel) [![Research software impact](http://depsy.org/api/package/pypi/GPy/badge.svg)](http://depsy.org/package/python/GPy) [![Code Health](https://landscape.io/github/SheffieldML/GPy/devel/landscape.svg?style=flat)](https://landscape.io/github/SheffieldML/GPy/devel) ## Updated Structure From b6847366120ef1b4ad7232c8fb80ed26ad448802 Mon Sep 17 00:00:00 2001 From: mzwiessele Date: Tue, 16 Aug 2016 12:01:17 +0100 Subject: [PATCH 15/15] [plotting] 
small edit --- GPy/plotting/abstract_plotting_library.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/GPy/plotting/abstract_plotting_library.py b/GPy/plotting/abstract_plotting_library.py index 64061b99..95377ed7 100644 --- a/GPy/plotting/abstract_plotting_library.py +++ b/GPy/plotting/abstract_plotting_library.py @@ -61,6 +61,8 @@ class AbstractPlottingLibrary(object): """ Get a new figure with nrows and ncolumns subplots. Does not initialize the canvases yet. + + There is individual kwargs for the individual plotting libraries to use. """ raise NotImplementedError("Implement all plot functions in AbstractPlottingLibrary in order to use your own plotting library")