[merge] devel

This commit is contained in:
mzwiessele 2016-08-16 12:03:45 +01:00
commit 746d4daae8
8 changed files with 356 additions and 3 deletions

View file

@ -26,3 +26,5 @@ from .dpgplvm import DPBayesianGPLVM
from .state_space_model import StateSpace
from .ibp_lfm import IBPLFM
from .gp_offset_regression import GPOffsetRegression

View file

@ -0,0 +1,95 @@
# Copyright (c) 2012 - 2014 the GPy Authors (see AUTHORS.txt)
# Licensed under the BSD 3-clause license (see LICENSE.txt)
# Written by Mike Smith. michaeltsmith.org.uk
import numpy as np
from ..core import GP
from .. import likelihoods
from .. import kern
from ..core import Param
class GPOffsetRegression(GP):
    """
    Gaussian Process model for offset regression.

    Each row of X carries an integer cluster label in its last column; the
    model learns one offset hyperparameter per cluster (cluster 0 is the
    reference and is pinned at zero) and shifts each cluster's inputs by its
    learned offset before applying standard GP regression.

    :param X: input observations, we assume for this class that this has one dimension of actual inputs and the last dimension should be the index of the cluster (so X should be Nx2)
    :param Y: observed values (Nx1?)
    :param kernel: a GPy kernel, defaults to rbf
    :param Norm normalizer: [False]
    :param noise_var: the noise variance for Gaussian likelihood, defaults to 1.

    Normalize Y with the norm given.
    If normalizer is False, no normalization will be done
    If it is None, we use GaussianNorm(alization)

    .. Note:: Multiple independent outputs are allowed using columns of Y
    """
    def __init__(self, X, Y, kernel=None, Y_metadata=None, normalizer=None, noise_var=1., mean_function=None):
        # The last input column is the cluster label, not a regression input.
        assert X.shape[1]>1, "Need at least two input dimensions, as last dimension is the label of the cluster"
        if kernel is None:
            # Kernel acts only on the real inputs (label column excluded).
            kernel = kern.RBF(X.shape[1]-1)
        #self._log_marginal_likelihood = np.nan #todo
        likelihood = likelihoods.Gaussian(variance=noise_var)
        # Split off the regression inputs and the integer cluster labels.
        self.X_fixed = X[:,:-1]
        self.selected = np.array([int(x) for x in X[:,-1]])
        super(GPOffsetRegression, self).__init__(X, Y, kernel, likelihood, name='GP offset regression', Y_metadata=Y_metadata, normalizer=normalizer, mean_function=mean_function)
        # One offset parameter per cluster beyond cluster 0; labels are
        # assumed to be 0..maxcluster (cluster 0 gets no free offset).
        maxcluster = np.max(self.selected)
        self.offset = Param('offset', np.zeros(maxcluster))
        #self.offset.set_prior(...)
        self.link_parameter(self.offset)

    #def dr_doffset(self, X, sel): #how much r changes wrt the offset hyperparameters
    #def dL_doffset(self, X, sel):
    #    dL_dr = self.dK_dr_via_X(X, X) * dL_dK
    def dr_doffset(self,X,sel,delta):
        """Derivative of the kernel radius r w.r.t. each offset hyperparameter.

        :param X: inputs already shifted by the current offsets (the label
            column removed); assumed effectively one-dimensional here — the
            np.repeat construction of `w` below only makes sense for Nx1
            input (TODO confirm for >1 real input dimension).
        :param sel: integer cluster label per row of X
        :param delta: current offset values, one per cluster >= 1
        :returns: dr/dDelta, one NxN matrix per offset
        """
        #given an input matrix, X and the offsets (delta)
        #finds dr/dDelta
        #returns them as a list, one for each offset (delta).
        #get the input values
        #a matrix G represents the effect of increasing the offset on the radius passed to the kernel for each input. For example
        #what effect will increasing offset 4 have on the kernel output of inputs 5 and 8? Answer: Gs[4][5,8]... (positive or negative)
        Gs = []
        for i,d in enumerate(delta):
            #X[sel==(i+1)]-=d
            # +1 where row belongs to cluster i+1, -1 where column does, 0
            # where both or neither do (the offsets then cancel).
            G = np.repeat(np.array(sel==(i+1))[:,None]*1,len(X),axis=1) - np.repeat(np.array(sel==(i+1))[None,:]*1,len(X),axis=0)
            Gs.append(G)
        #does subtracting the two Xs end up positive or negative (if negative we need to flip the sign in G).
        w = np.repeat(X,len(X),axis=1) - np.repeat(X.T,len(X),axis=0)
        dr_doffsets = []
        for i,d in enumerate(delta):
            # sign of (X_i - X_j) combined with G gives the direction in which
            # the pairwise distance changes as this offset grows.
            dr_doffset = np.sign(w * Gs[i])
            #print "dr_doffset %d" % i
            #print dr_doffset
            #print Gs[i]
            #print w
            dr_doffsets.append(dr_doffset)
        #lastly we need to divide by the lengthscale: So far we've found d(X_i - X_j)/dOffsets
        #we want dr/dOffsets. (X_i - X_j)/lengthscale = r
        # NOTE(review): dr_doffsets is a Python list at this point; `/=`
        # relies on numpy's broadcasting to promote it to an ndarray —
        # confirm the kernel is isotropic (single lengthscale) here.
        dr_doffsets /= self.kern.lengthscale
        return dr_doffsets

    def parameters_changed(self):
        """Recompute the shifted inputs and the offset gradients.

        Prepends a fixed 0.0 offset for cluster 0, shifts every row by its
        cluster's offset, then chains dL/dK -> dL/dr -> dL/doffset.
        """
        offsets = np.hstack([0.0,self.offset.values])[:,None]
        self.X = self.X_fixed - offsets[self.selected]
        # Let the parent recompute the posterior/grad_dict for the new X.
        super(GPOffsetRegression, self).parameters_changed()
        # presumably grad_dict['dL_dK'] is populated by the parent's
        # parameters_changed — TODO confirm against GP base class.
        dL_dr = self.kern.dK_dr_via_X(self.X, self.X) * self.grad_dict['dL_dK']
        dr_doff = self.dr_doffset(self.X,self.selected,self.offset.values)
        for i in range(len(dr_doff)):
            dL_doff = dL_dr * dr_doff[i]
            # Negative sign: increasing the offset moves X in the negative
            # direction (X = X_fixed - offset).
            self.offset.gradient[i] = -np.sum(dL_doff)

View file

@ -61,6 +61,8 @@ class AbstractPlottingLibrary(object):
"""
Get a new figure with nrows and ncolumns subplots.
Does not initialize the canvases yet.
There are individual kwargs for the individual plotting libraries to use.
"""
raise NotImplementedError("Implement all plot functions in AbstractPlottingLibrary in order to use your own plotting library")

View file

@ -427,8 +427,29 @@ class MiscTests(unittest.TestCase):
warp_m.predict_quantiles(X)
warp_m.log_predictive_density(X, Y)
warp_m.predict_in_warped_space = False
warp_m.predict(X)
warp_m.predict_quantiles(X)
warp_m.plot()
warp_m.predict_in_warped_space = True
warp_m.plot()
def test_offset_regression(self):
#Tests GPy.models.GPOffsetRegression. Using two small time series
#from a sine wave, we confirm the algorithm determines that the
#likelihood is maximised when the offset hyperparameter is approximately
#equal to the actual offset in X between the two time series.
offset = 3
X1 = np.arange(0,50,5.0)[:,None]
X2 = np.arange(0+offset,50+offset,5.0)[:,None]
X = np.vstack([X1,X2])
ind = np.vstack([np.zeros([10,1]),np.ones([10,1])])
X = np.hstack([X,ind])
Y = np.sin((X[0:10,0])/30.0)[:,None]
Y = np.vstack([Y,Y])
m = GPy.models.GPOffsetRegression(X,Y)
m.rbf.lengthscale=5.0 #make it something other than one to check our gradients properly!
assert m.checkgrad(), "Gradients of offset parameters don't match numerical approximations."
m.optimize()
assert np.abs(m.offset[0]-offset)<0.1, ("GPOffsetRegression model failing to estimate correct offset (value estimated = %0.2f instead of %0.2f)" % (m.offset[0], offset))
class GradientTests(np.testing.TestCase):

View file

@ -29,6 +29,7 @@
#===============================================================================
import unittest, numpy as np
import GPy
class TestDebug(unittest.TestCase):
def test_checkFinite(self):
@ -107,3 +108,50 @@ class TestDebug(unittest.TestCase):
self.assertTrue(d[tuple(X[:,4])] == d[tuple(X[:,0])] == [0, 4, 5])
self.assertTrue(d[tuple(X[:,1])] == [1])
def test_offset_cluster(self):
#Tests the GPy.util.cluster_with_offset.cluster utility with a small
#test data set. Not using random noise just in case it occasionally
#causes it not to cluster correctly.
#groundtruth cluster identifiers are: [0,1,1,0]
#data contains a list of the four sets of time series (3 per data point)
data = [np.array([[ 2.18094245, 1.96529789, 2.00265523, 2.18218742, 2.06795428],
[ 1.62254829, 1.75748448, 1.83879347, 1.87531326, 1.52503496],
[ 1.54589609, 1.61607914, 2.00463192, 1.48771394, 1.63339218]]),
np.array([[ 2.86766106, 2.97953437, 2.91958876, 2.92510506, 3.03239241],
[ 2.57368423, 2.59954886, 3.10000395, 2.75806125, 2.89865704],
[ 2.58916318, 2.53698259, 2.63858411, 2.63102504, 2.51853901]]),
np.array([[ 2.77834168, 2.9618564 , 2.88482141, 3.24259745, 2.9716821 ],
[ 2.60675576, 2.67095624, 2.94824436, 2.80520631, 2.87247516],
[ 2.49543562, 2.5492281 , 2.6505866 , 2.65015308, 2.59738616]]),
np.array([[ 1.76783086, 2.21666738, 2.07939706, 1.9268263 , 2.23360121],
[ 1.94305547, 1.94648592, 2.1278921 , 2.09481457, 2.08575238],
[ 1.69336013, 1.72285186, 1.6339506 , 1.61212022, 1.39198698]])]
#inputs contains their associated X values
inputs = [np.array([[ 0. ],
[ 0.68040097],
[ 1.20316795],
[ 1.798749 ],
[ 2.14891733]]), np.array([[ 0. ],
[ 0.51910637],
[ 0.98259352],
[ 1.57442965],
[ 1.82515098]]), np.array([[ 0. ],
[ 0.66645478],
[ 1.59464591],
[ 1.69769551],
[ 1.80932752]]), np.array([[ 0. ],
[ 0.87512108],
[ 1.71881079],
[ 2.67162871],
[ 3.23761907]])]
#try doing the clustering
active = GPy.util.cluster_with_offset.cluster(data,inputs)
#check to see that the clustering has correctly clustered the time series.
clusters = set([frozenset(cluster) for cluster in active])
assert set([1,2]) in clusters, "Offset Clustering algorithm failed"
assert set([0,3]) in clusters, "Offset Clustering algoirthm failed"

View file

@ -16,3 +16,4 @@ from . import initialization
from . import multioutput
from . import parallel
from . import functions
from . import cluster_with_offset

View file

@ -0,0 +1,184 @@
# Copyright (c) 2016, Mike Smith
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import GPy
import numpy as np
import sys #so I can print dots
def get_log_likelihood(inputs, data, clust):
    """Log-likelihood of a combined set of clusters, ignoring offsets.

    Fits a plain GP regression to the pooled data of the listed clusters,
    without modelling any per-series offset. Used mainly when only a single
    cluster is involved.

    arguments:
    inputs -- the 'X's in a list, one item per cluster
    data -- the 'Y's in a list, one item per cluster
    clust -- list of clusters to use

    returns a tuple:
    log likelihood and the offset (which is always zero for this model)
    """
    n_series = data[0].shape[0]  #number of time series
    #pool every listed cluster into one dataset (the empty arrays keep the
    #column counts right even if clust is empty)
    X = np.vstack([np.zeros([0, 1])] + [inputs[p] for p in clust])
    Y = np.vstack([np.zeros([0, n_series])] + [data[p].T for p in clust])
    #a single multi-output regression gives the summed LL over the series
    model = GPy.models.GPRegression(X, Y)
    model.optimize()
    return model.log_likelihood(), 0
def get_log_likelihood_offset(inputs, data, clust):
    """Log-likelihood of a combined set of clusters, fitting the offsets.

    arguments:
    inputs -- the 'X's in a list, one item per cluster
    data -- the 'Y's in a list, one item per cluster
    clust -- list of clusters to use

    returns a tuple:
    log likelihood and the offset
    """
    #the offset model errors with a single cluster, so fall back to plain
    #GP regression in that case (its offset is reported as zero).
    if len(clust) == 1:
        return get_log_likelihood(inputs, data, clust)
    n_series = data[0].shape[0]  #number of time series
    #tag each row with the index of its cluster within clust (extra column);
    #the offset model uses this to know which data points it may shift.
    X_parts = [np.zeros([0, 2])]
    Y_parts = [np.zeros([0, n_series])]
    for i, p in enumerate(clust):
        tag = i * np.ones([inputs[p].shape[0], 1])
        X_parts.append(np.hstack([inputs[p], tag]))
        Y_parts.append(data[p].T)
    model = GPy.models.GPOffsetRegression(np.vstack(X_parts), np.vstack(Y_parts))
    #TODO: How to select a sensible prior?
    model.offset.set_prior(GPy.priors.Gaussian(0, 20))
    #TODO: Set a sensible start value for the length scale,
    #make it long to help the offset fit.
    model.optimize()
    return model.log_likelihood(), model.offset.values[0]
def cluster(data, inputs, verbose=False):
    """Clusters data.

    Using the new offset model, this method uses a greedy algorithm to cluster
    the data. It starts with all the data points in separate clusters and tests
    whether combining them increases the overall log-likelihood (LL). It then
    iteratively joins pairs of clusters which cause the greatest increase in
    the LL, until no join increases the LL.

    arguments:
    inputs -- the 'X's in a list, one item per cluster
    data -- the 'Y's in a list, one item per cluster
    verbose -- if True, print per-iteration progress (default False)

    returns a list of the clusters.

    Note: `data` and `inputs` are modified in place as clusters are merged.
    """
    N = len(data)
    #Define a set of N active clusters, one per series
    active = [[p] for p in range(N)]
    #NaN marks entries that still need (re)computing
    loglikes = np.full(len(active), np.nan)
    pairloglikes = np.full([len(active), len(active)], np.nan)
    pairoffset = np.zeros([len(active), len(active)])
    it = 0
    while True:
        if verbose:
            it += 1
            print("Iteration %d" % it)
        #FIX: stop when only one cluster remains — previously nanargmax was
        #called on an all-NaN 1x1 matrix and raised ValueError.
        if len(active) < 2:
            break
        #Compute the LL of each cluster alone and of each candidate pair
        for clusti in range(len(active)):
            if verbose:
                sys.stdout.write('.')
                sys.stdout.flush()
            if np.isnan(loglikes[clusti]):
                loglikes[clusti], unused_offset = get_log_likelihood_offset(inputs, data, [clusti])
            #try combining with each other cluster (lower triangle only,
            #since pairs are unordered)
            for clustj in range(clusti):
                if np.isnan(pairloglikes[clusti, clustj]):
                    pairloglikes[clusti, clustj], pairoffset[clusti, clustj] = get_log_likelihood_offset(inputs, data, [clusti, clustj])
        #LL of each pair if the two clusters are kept separate
        seploglikes = np.repeat(loglikes[:, None].T, len(loglikes), 0) + np.repeat(loglikes[:, None], len(loglikes), 1)
        loglikeimprovement = pairloglikes - seploglikes  #how much the LL improves by merging
        top = np.unravel_index(np.nanargmax(pairloglikes - seploglikes), pairloglikes.shape)
        #if no merge improves the LL we are done
        if loglikeimprovement[top[0], top[1]] <= 0:
            break
        #merge cluster top[1] into top[0], shifting its inputs by the offset
        #the pair model fitted
        active[top[0]].extend(active[top[1]])
        offset = pairoffset[top[0], top[1]]
        inputs[top[0]] = np.vstack([inputs[top[0]], inputs[top[1]] - offset])
        data[top[0]] = np.hstack([data[top[0]], data[top[1]]])
        del inputs[top[1]]
        del data[top[1]]
        del active[top[1]]
        #NaN = message to say we need to recalculate anything involving the
        #merged cluster
        pairloglikes[:, top[0]] = np.nan
        pairloglikes[top[0], :] = np.nan
        pairloglikes = np.delete(pairloglikes, top[1], 0)
        pairloglikes = np.delete(pairloglikes, top[1], 1)
        #FIX: pairoffset must shrink in step with pairloglikes — the original
        #code never deleted its rows/columns, so after the first merge the
        #offset matrix no longer lined up with pairloglikes and stale offsets
        #could be applied to the wrong pair.
        pairoffset = np.delete(pairoffset, top[1], 0)
        pairoffset = np.delete(pairoffset, top[1], 1)
        loglikes[top[0]] = np.nan
        loglikes = np.delete(loglikes, top[1])
    #TODO Add a way to return the offsets applied to all the time series
    return active
#starttime = time.time()
#active = cluster(data,inputs)
#endtime = time.time()
#print "TOTAL TIME %0.4f" % (endtime-starttime)