Moved code from gp.py to model.py, as it's model independent, and making it more general allows us to write more simple test code

This commit is contained in:
Michael T Smith 2016-11-10 09:44:27 +00:00
parent dc5320194a
commit 6b68110acf
2 changed files with 196 additions and 192 deletions

View file

@ -13,7 +13,6 @@ from GPy.core.parameterization.variational import VariationalPosterior
import logging
import warnings
from GPy.util.normalizer import MeanNorm
from ..util.linalg import pdinv
logger = logging.getLogger("GP")
class GP(Model):
@ -603,195 +602,4 @@ class GP(Model):
mu_star, var_star = self._raw_predict(x_test)
return self.likelihood.log_predictive_density_sampling(y_test, mu_star, var_star, Y_metadata=Y_metadata, num_samples=num_samples)
def CCD(self):
"""
Code is based on implementation within GPStuff, INLA and the original Sanchez and Sanchez paper (2005)
"""
modal_params = self.optimizer_array[:].copy()
num_free_params = modal_params.shape[0]
# Calculate the numerical hessian for *every* parameter
H = self.numerical_parameter_hessian()
try:
curv = pdinv(H)[0]
except np.linalg.linalg.LinAlgError:
print("Hessian is not positive definite, ensure parameters are at their MAP solution by optimizing, if that doesn't work try modifying the step length parameter")
return
# CCD points are calculated for unit circle, so we take the principle components and scale them such that we have a unit sphere in our parameter space, which we can then map back to parameter space
[w, V] = np.linalg.eig(curv)
z = (V*np.sqrt(w[None,:])).T
# Calculate points, we will do just CCD for now
# First we build the hadamarx matrix from the paper that encodes *all* the points possible in the num_free_params dimensional space.
grow = np.sum(num_free_params >= np.array([1, 2, 3, 4, 6, 7, 9, 12, 18, 22, 30, 39, 53, 70, 93]))
H0 = 1
#Build the design matrix that holds *every* corner on the num_free_params dimensional hyper-cube (the hadamard matrix), H in the paper
for i in range(grow):
H0 = np.vstack([np.hstack([H0, H0]),
np.hstack([H0, -H0])])
# For each additional parameter our dimensional space increases, as does the number of CCD points we require to do the approximate integration. CCD is a method of deciding which points should be included and which can be ignored without a significant effect. It would be ideal but too computationally expensive - when we have many parameters - to use all the corners of the hypercube in our integration
# See Sanchez and Sanchez (2005) for details. on how the points are chosen
walsh_inds = np.array([1, 2, 4, 8, 15, 16, 32, 51, 64, 85, 106, 128, 150, 171, 219, 237, 247, 256, 279, 297, 455, 512, 537, 557, 597, 643, 803, 863, 898, 1024, 1051, 1070, 1112, 1169, 1333, 1345, 1620, 1866, 2048, 2076, 2085, 2158, 2372, 2456, 2618, 2800, 2873, 3127, 3284, 3483, 3557, 3763, 4096, 4125, 4135, 4176, 4435, 4459, 4469, 4497, 4752, 5255, 5732, 5801, 5915, 6100, 6369, 6907, 7069, 8192, 8263, 8351, 8422, 8458, 8571, 8750, 8858, 9124, 9314, 9500, 10026, 10455, 10556, 11778, 11885, 11984, 13548, 14007, 14514, 14965, 15125, 15554, 16384, 16457, 16517, 16609, 16771, 16853, 17022, 17453, 17891, 18073, 18562, 18980, 19030, 19932, 20075, 20745, 21544, 22633, 23200, 24167, 25700, 26360, 26591, 26776, 28443, 28905, 29577, 32705])
used_walsh_inds = walsh_inds[:num_free_params]
# For every additional parameters, we get an additional number of points (which as we decide to drop points using Sanchezes rules, won't necessarily grow at the same rate)
ccd_points = H0[:, used_walsh_inds]
# ccd only gives corner points, so lets add an additional one for the centre point
ccd_points = np.vstack([np.zeros((1,num_free_params)), ccd_points])
# Now we add the points on the unit hyper-sphere that arent on the corners
for i in range(num_free_params):
top_edge = np.zeros(num_free_params)
top_edge[i] = np.sqrt(num_free_params)
bottom_edge = np.zeros(num_free_params)
bottom_edge[i] = -np.sqrt(num_free_params)
ccd_points = np.vstack([ccd_points, top_edge, bottom_edge])
# Find the appropriate scaling such that the edges lie on a boundry of equal density
# First find the density at the mode
log_marginal = lambda p: -self._objective(p)
mode_density = log_marginal(modal_params) # FIXME: We really want to evaluate the log_marginal not the negative objective as it makes more sense, but they are equivalent in this case
# Treating the posterior over hyperparameters as a standard normal, moving z*sqrt(2) should make the likelihood drop by 1, as z should be acting as a unit vector along the principle components of the posterior.
scalings = np.ones_like(ccd_points)
# The below code makes me feel dirty. Pythonise it!
# This is naive scaling, assuming that it is well approximated by a standard normal, in practice you might want to scale in different directions seperately (split normal approximation)
for j in range(num_free_params*2):
temp = np.zeros((1, num_free_params))
if j % 2:
direction = -1
else:
direction = 1
ind = np.ceil(j // 2) # This integer division is required, will not work with python 3
temp[0, ind] = direction
# This is the point mapped onto the contour of a unit gaussian, stretched by the eigenvectors over the marginal likelihood
contour_point = modal_params + np.sqrt(2)*temp.dot(z)
point_density = log_marginal(contour_point)
if mode_density > point_density:
# The scale is based on the amount this point must be scaled in order to be on the contour of the stretched standard normal (stretched by principle components)
scale = np.sqrt(1.0 / (mode_density - point_density))
else:
print("Contour point is higher than mode, must be multimodal or mode not found")
scale = 1 # Print a warning and say "dont scale in this direction"
# Set the scaling for all dimensions where the point is in this direction, when looking at this dimension, clipped.
scalings[ccd_points[:, ind]*direction > 0.0, ind] = np.maximum(np.minimum(scale, 10.0), 1/10.0)
# Make the points *slightly* further out. Aki Vehtari has shown this to be more accurate
over_scale = 1.1
ccd_points = over_scale*scalings*ccd_points
# We have the scaled ccd_points, now we need to map them into parameter space by projecting along principle components and shifting to mode
param_points = ccd_points.dot(z) + modal_params
# Evaluate log marginal at all parameter points
point_densities = np.ones(param_points.shape[0])*np.nan
for point_ind, param_point in enumerate(param_points):
point_densities[point_ind] = log_marginal(param_point)
# Remove nan densities
non_nan_densities = np.isfinite(point_densities).flatten()
point_densities = point_densities[non_nan_densities]
param_points = param_points[non_nan_densities, :]
point_densities = point_densities - np.max(point_densities) # Why don't we have to deal with this shift?
point_densities = np.exp(point_densities)
if num_free_params > 0:
# Each section that the ccd point represents needs an associated weight. Since every point is on the same contour, all weights are the same, except the centre weight
point_weights = 1.0/((2.0*np.pi)**(-num_free_params*0.5) * np.exp(-0.5*num_free_params*over_scale**2) * (param_points.shape[0]-1)*over_scale**2)
centre_weight = (2*np.pi)**(num_free_params*0.5) * (1 - 1.0/over_scale**2)
# normalize
point_weights = point_weights / centre_weight
centre_weight = 1.0
point_densities[1:] *= point_weights
# Normalize density
point_densities /= point_densities.sum()
# Remove small density points
non_small_densities = (point_densities > 0.01/point_densities.size).flatten()
point_densities = point_densities[non_small_densities]
param_points = param_points[non_small_densities, :]
point_densities /= point_densities.sum()
# for dimension_ind, dimension in enumerate(num_free_params):
# # Due to the way we built the ccd_points up, every other one changes to the next direction
# ind = np.ceil(dimension_ind / 2.0)
# if dimension_ind % 2:
# direction = 1
# else:
# direction = -1
# print(z[param_dir,:])
# for direction in [1, -1]:
# for parameter in range(num_free_params):
#If the log likelihood doesn't even drop when we move away from the mode, then we havent found the modal points, or there are multiple modes in the posterior (bad)
# upper_density = self._objective(modal_params + np.sqrt(2)*np.dot(direction,z))
# We want to work out the scaling for every point, so we first figure out which way the vector is scaling from the mode, and scale in that direction
# fig = plt.figure()
# if num_free_params == 3:
# ax = fig.add_subplot(111, projection='3d')
# ax.scatter(ccd_points[:,0], ccd_points[:,1], ccd_points[:,2])
# elif num_free_params == 2:
# plt.scatter(ccd_points[:,0], ccd_points[:,1])
# plt.show()
import paramz
transformed_points = param_points.copy()
# Need to transform the points to the space of parameters again
f = np.ones(self.size).astype(bool)
f[self.constraints[paramz.transformations.__fixed__]] = paramz.transformations.FIXED
new_t_points = [c.f(transformed_points[:, ind[f[ind]]]) for c, ind in self.constraints.items() if c != paramz.transformations.__fixed__][0]
return new_t_points, point_densities
def numerical_parameter_hessian(self, step_length=1e-3):
"""
Calculate the numerical hessian for the posterior of the parameters
Often this will want to be done at the modal values of the parameters so the optimizer should be called first
H = [d^2L_dp1dp1, d^2L_dp1dp2, d^2L_dp1dp3, ...;
d^2L_dp2dp1, d^2L_dp2dp2, d^2L_dp3dp3, ...;
...
]
Where the vector p is all of the parameters in param_array, not just parameters of this model class itself
"""
# We use optimizer array as we want to ignore any parameters with fixed values
num_child_params = self.optimizer_array.shape[0]
H = np.eye(num_child_params)
initial_params = self.optimizer_array[:].copy()
# Calculate gradient of marginal likelihood moving one at a time
for i in range(num_child_params):
# Pick out the e vector with one in the parameter that we wish to move
e = H[:,i]
# Get parameters required to evaluate by nudging left and right
left = initial_params - step_length*e
right = initial_params + step_length*e
left_grad = self._grads(left)
right_grad = self._grads(right)
finite_diff = (right_grad - left_grad) / (2*step_length)
#Put in the correct row of the hessian
H[:,i] = finite_diff
return H

View file

@ -3,6 +3,9 @@
from .parameterization.priorizable import Priorizable
from paramz import Model as ParamzModel
import numpy as np
from ..util.linalg import pdinv
class Model(ParamzModel, Priorizable):
def __init__(self, name):
@ -46,3 +49,196 @@ class Model(ParamzModel, Priorizable):
probabilistic, just return your *negative* gradient here!
"""
return -(self._log_likelihood_gradients() + self._log_prior_gradients())
def CCD(self):
"""
Code is based on implementation within GPStuff, INLA and the original Sanchez and Sanchez paper (2005)
"""
modal_params = self.optimizer_array[:].copy()
num_free_params = modal_params.shape[0]
# Calculate the numerical hessian for *every* parameter
H = self.numerical_parameter_hessian()
try:
curv = pdinv(H)[0]
except np.linalg.linalg.LinAlgError:
print("Hessian is not positive definite, ensure parameters are at their MAP solution by optimizing, if that doesn't work try modifying the step length parameter")
return
# CCD points are calculated for unit circle, so we take the principle components and scale them such that we have a unit sphere in our parameter space, which we can then map back to parameter space
[w, V] = np.linalg.eig(curv)
z = (V*np.sqrt(w[None,:])).T
# Calculate points, we will do just CCD for now
# First we build the hadamarx matrix from the paper that encodes *all* the points possible in the num_free_params dimensional space.
grow = np.sum(num_free_params >= np.array([1, 2, 3, 4, 6, 7, 9, 12, 18, 22, 30, 39, 53, 70, 93]))
H0 = 1
#Build the design matrix that holds *every* corner on the num_free_params dimensional hyper-cube (the hadamard matrix), H in the paper
for i in range(grow):
H0 = np.vstack([np.hstack([H0, H0]),
np.hstack([H0, -H0])])
# For each additional parameter our dimensional space increases, as does the number of CCD points we require to do the approximate integration. CCD is a method of deciding which points should be included and which can be ignored without a significant effect. It would be ideal but too computationally expensive - when we have many parameters - to use all the corners of the hypercube in our integration
# See Sanchez and Sanchez (2005) for details. on how the points are chosen
walsh_inds = np.array([1, 2, 4, 8, 15, 16, 32, 51, 64, 85, 106, 128, 150, 171, 219, 237, 247, 256, 279, 297, 455, 512, 537, 557, 597, 643, 803, 863, 898, 1024, 1051, 1070, 1112, 1169, 1333, 1345, 1620, 1866, 2048, 2076, 2085, 2158, 2372, 2456, 2618, 2800, 2873, 3127, 3284, 3483, 3557, 3763, 4096, 4125, 4135, 4176, 4435, 4459, 4469, 4497, 4752, 5255, 5732, 5801, 5915, 6100, 6369, 6907, 7069, 8192, 8263, 8351, 8422, 8458, 8571, 8750, 8858, 9124, 9314, 9500, 10026, 10455, 10556, 11778, 11885, 11984, 13548, 14007, 14514, 14965, 15125, 15554, 16384, 16457, 16517, 16609, 16771, 16853, 17022, 17453, 17891, 18073, 18562, 18980, 19030, 19932, 20075, 20745, 21544, 22633, 23200, 24167, 25700, 26360, 26591, 26776, 28443, 28905, 29577, 32705])
used_walsh_inds = walsh_inds[:num_free_params]
# For every additional parameters, we get an additional number of points (which as we decide to drop points using Sanchezes rules, won't necessarily grow at the same rate)
ccd_points = H0[:, used_walsh_inds]
# ccd only gives corner points, so lets add an additional one for the centre point
ccd_points = np.vstack([np.zeros((1,num_free_params)), ccd_points])
# Now we add the points on the unit hyper-sphere that arent on the corners
for i in range(num_free_params):
top_edge = np.zeros(num_free_params)
top_edge[i] = np.sqrt(num_free_params)
bottom_edge = np.zeros(num_free_params)
bottom_edge[i] = -np.sqrt(num_free_params)
ccd_points = np.vstack([ccd_points, top_edge, bottom_edge])
# Find the appropriate scaling such that the edges lie on a boundry of equal density
# First find the density at the mode
log_marginal = lambda p: -self._objective(p)
mode_density = log_marginal(modal_params) # FIXME: We really want to evaluate the log_marginal not the negative objective as it makes more sense, but they are equivalent in this case
# Treating the posterior over hyperparameters as a standard normal, moving z*sqrt(2) should make the likelihood drop by 1, as z should be acting as a unit vector along the principle components of the posterior.
scalings = np.ones_like(ccd_points)
# The below code makes me feel dirty. Pythonise it!
# This is naive scaling, assuming that it is well approximated by a standard normal, in practice you might want to scale in different directions seperately (split normal approximation)
for j in range(num_free_params*2):
temp = np.zeros((1, num_free_params))
if j % 2:
direction = -1
else:
direction = 1
ind = np.ceil(j // 2) # This integer division is required, will not work with python 3
temp[0, ind] = direction
# This is the point mapped onto the contour of a unit gaussian, stretched by the eigenvectors over the marginal likelihood
contour_point = modal_params + np.sqrt(2)*temp.dot(z)
point_density = log_marginal(contour_point)
if mode_density > point_density:
# The scale is based on the amount this point must be scaled in order to be on the contour of the stretched standard normal (stretched by principle components)
scale = np.sqrt(1.0 / (mode_density - point_density))
else:
print("Contour point is higher than mode, must be multimodal or mode not found")
scale = 1 # Print a warning and say "dont scale in this direction"
# Set the scaling for all dimensions where the point is in this direction, when looking at this dimension, clipped.
scalings[ccd_points[:, ind]*direction > 0.0, ind] = np.maximum(np.minimum(scale, 10.0), 1/10.0)
# Make the points *slightly* further out. Aki Vehtari has shown this to be more accurate
over_scale = 1.1
ccd_points = over_scale*scalings*ccd_points
# We have the scaled ccd_points, now we need to map them into parameter space by projecting along principle components and shifting to mode
param_points = ccd_points.dot(z) + modal_params
# Evaluate log marginal at all parameter points
point_densities = np.ones(param_points.shape[0])*np.nan
for point_ind, param_point in enumerate(param_points):
point_densities[point_ind] = log_marginal(param_point)
# Remove nan densities
non_nan_densities = np.isfinite(point_densities).flatten()
point_densities = point_densities[non_nan_densities]
param_points = param_points[non_nan_densities, :]
point_densities = point_densities - np.max(point_densities) # Why don't we have to deal with this shift?
point_densities = np.exp(point_densities)
if num_free_params > 0:
# Each section that the ccd point represents needs an associated weight. Since every point is on the same contour, all weights are the same, except the centre weight
point_weights = 1.0/((2.0*np.pi)**(-num_free_params*0.5) * np.exp(-0.5*num_free_params*over_scale**2) * (param_points.shape[0]-1)*over_scale**2)
centre_weight = (2*np.pi)**(num_free_params*0.5) * (1 - 1.0/over_scale**2)
# normalize
point_weights = point_weights / centre_weight
centre_weight = 1.0
point_densities[1:] *= point_weights
# Normalize density
point_densities /= point_densities.sum()
# Remove small density points
non_small_densities = (point_densities > 0.01/point_densities.size).flatten()
point_densities = point_densities[non_small_densities]
param_points = param_points[non_small_densities, :]
point_densities /= point_densities.sum()
# for dimension_ind, dimension in enumerate(num_free_params):
# # Due to the way we built the ccd_points up, every other one changes to the next direction
# ind = np.ceil(dimension_ind / 2.0)
# if dimension_ind % 2:
# direction = 1
# else:
# direction = -1
# print(z[param_dir,:])
# for direction in [1, -1]:
# for parameter in range(num_free_params):
#If the log likelihood doesn't even drop when we move away from the mode, then we havent found the modal points, or there are multiple modes in the posterior (bad)
# upper_density = self._objective(modal_params + np.sqrt(2)*np.dot(direction,z))
# We want to work out the scaling for every point, so we first figure out which way the vector is scaling from the mode, and scale in that direction
# fig = plt.figure()
# if num_free_params == 3:
# ax = fig.add_subplot(111, projection='3d')
# ax.scatter(ccd_points[:,0], ccd_points[:,1], ccd_points[:,2])
# elif num_free_params == 2:
# plt.scatter(ccd_points[:,0], ccd_points[:,1])
# plt.show()
import paramz
transformed_points = param_points.copy()
# Need to transform the points to the space of parameters again
f = np.ones(self.size).astype(bool)
f[self.constraints[paramz.transformations.__fixed__]] = paramz.transformations.FIXED
new_t_points = [c.f(transformed_points[:, ind[f[ind]]]) for c, ind in self.constraints.items() if c != paramz.transformations.__fixed__][0]
return new_t_points, point_densities
def numerical_parameter_hessian(self, step_length=1e-3):
"""
Calculate the numerical hessian for the posterior of the parameters
Often this will want to be done at the modal values of the parameters so the optimizer should be called first
H = [d^2L_dp1dp1, d^2L_dp1dp2, d^2L_dp1dp3, ...;
d^2L_dp2dp1, d^2L_dp2dp2, d^2L_dp3dp3, ...;
...
]
Where the vector p is all of the parameters in param_array, not just parameters of this model class itself
"""
# We use optimizer array as we want to ignore any parameters with fixed values
num_child_params = self.optimizer_array.shape[0]
H = np.eye(num_child_params)
initial_params = self.optimizer_array[:].copy()
# Calculate gradient of marginal likelihood moving one at a time
for i in range(num_child_params):
# Pick out the e vector with one in the parameter that we wish to move
e = H[:,i]
# Get parameters required to evaluate by nudging left and right
left = initial_params - step_length*e
right = initial_params + step_length*e
left_grad = self._grads(left)
right_grad = self._grads(right)
finite_diff = (right_grad - left_grad) / (2*step_length)
#Put in the correct row of the hessian
H[:,i] = finite_diff
return H