GPy/GPy/core/model.py

# Copyright (c) 2012-2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from .parameterization.priorizable import Priorizable
from paramz import Model as ParamzModel

import numpy as np
from ..util.linalg import pdinv

class Model(ParamzModel, Priorizable):

    def __init__(self, name):
        super(Model, self).__init__(name)  # Parameterized.__init__(self)

    def log_likelihood(self):
        raise NotImplementedError("this needs to be implemented to use the model class")

    def _log_likelihood_gradients(self):
        return self.gradient#.copy()

    def objective_function(self):
        """
        The objective function for the given algorithm.

        This function is the true objective, which wants to be minimized.
        Note that all parameters are already set and in place, so you just need
        to return the objective function here.

        For probabilistic models this is the negative log_likelihood
        (including the MAP prior), so we return it here. If your model is not
        probabilistic, just return your objective to minimize here!
        """
        return -float(self.log_likelihood()) - self.log_prior()

    def objective_function_gradients(self):
        """
        The gradients for the objective function for the given algorithm.
        The gradients are w.r.t. the *negative* objective function, as
        this framework works with *negative* log-likelihoods as a default.

        You can find the gradient for the parameters in self.gradient at all times.
        This is the place, where gradients get stored for parameters.

        This function is the true objective, which wants to be minimized.
        Note that all parameters are already set and in place, so you just need
        to return the gradient here.

        For probabilistic models this is the gradient of the negative log_likelihood
        (including the MAP prior), so we return it here. If your model is not
        probabilistic, just return your *negative* gradient here!
        """
        return -(self._log_likelihood_gradients() + self._log_prior_gradients())

    def CCD(self):
        """
        Code is based on implementation within GPStuff, INLA and the original Sanchez and Sanchez paper (2005)
        """
        modal_params = self.optimizer_array[:].copy()
        num_free_params = modal_params.shape[0]

        # Calculate the numerical hessian for *every* parameter
        H = self.numerical_parameter_hessian()

        try:
            curv = pdinv(H)[0]
        except np.linalg.linalg.LinAlgError:
            print("Hessian is not positive definite, ensure parameters are at their MAP solution by optimizing, if that doesn't work try modifying the step length parameter")
            return
        # CCD points are calculated for unit circle, so we take the principle components and scale them such that we have a unit sphere in our parameter space, which we can then map back to parameter space
        [w, V] = np.linalg.eig(curv)
        z = (V*np.sqrt(w[None,:])).T

        # Calculate points, we will do just CCD for now

        # First we build the hadamarx matrix from the paper that encodes *all* the points possible in the num_free_params dimensional space.
        grow = np.sum(num_free_params >= np.array([1, 2, 3, 4, 6, 7, 9, 12, 18, 22, 30, 39, 53, 70, 93]))

        H0 = 1

        #Build the design matrix that holds *every* corner on the num_free_params dimensional hyper-cube (the hadamard matrix), H in the paper
        for i in range(grow):
            H0 = np.vstack([np.hstack([H0, H0]),
                            np.hstack([H0, -H0])])

        # For each additional parameter our dimensional space increases, as does the number of CCD points we require to do the approximate integration. CCD is a method of deciding which points should be included and which can be ignored without a significant effect. It would be ideal but too computationally expensive - when we have many parameters - to use all the corners of the hypercube in our integration
        # See Sanchez and Sanchez (2005) for details. on how the points are chosen
        walsh_inds = np.array([1, 2, 4, 8, 15, 16, 32, 51, 64, 85, 106, 128, 150, 171, 219, 237, 247, 256, 279, 297, 455, 512, 537, 557, 597, 643, 803, 863, 898, 1024, 1051, 1070, 1112, 1169, 1333, 1345, 1620, 1866, 2048, 2076, 2085, 2158, 2372, 2456, 2618, 2800, 2873, 3127, 3284, 3483, 3557, 3763, 4096, 4125, 4135, 4176, 4435, 4459, 4469, 4497, 4752, 5255, 5732, 5801, 5915, 6100, 6369, 6907, 7069, 8192, 8263, 8351, 8422, 8458, 8571, 8750, 8858, 9124, 9314, 9500, 10026, 10455, 10556, 11778, 11885, 11984, 13548, 14007, 14514, 14965, 15125, 15554, 16384, 16457, 16517, 16609, 16771, 16853, 17022, 17453, 17891, 18073, 18562, 18980, 19030, 19932, 20075, 20745, 21544, 22633, 23200, 24167, 25700, 26360, 26591, 26776, 28443, 28905, 29577, 32705])

        used_walsh_inds = walsh_inds[:num_free_params]

        # For every additional parameters, we get an additional number of points (which as we decide to drop points using Sanchezes rules, won't necessarily grow at the same rate)
        ccd_points = H0[:, used_walsh_inds]

        # ccd only gives corner points, so lets add an additional one for the centre point
        ccd_points = np.vstack([np.zeros((1,num_free_params)), ccd_points])

        # Now we add the points on the unit hyper-sphere that arent on the corners
        for i in range(num_free_params):
            top_edge = np.zeros(num_free_params)
            top_edge[i] = np.sqrt(num_free_params)
            bottom_edge = np.zeros(num_free_params)
            bottom_edge[i] = -np.sqrt(num_free_params)
            ccd_points = np.vstack([ccd_points, top_edge, bottom_edge])

        # Find the appropriate scaling such that the edges lie on a boundry of equal density
        # First find the density at the mode
        log_marginal = lambda p: -self._objective(p)
        mode_density = log_marginal(modal_params)  # FIXME: We really want to evaluate the log_marginal not the negative objective as it makes more sense, but they are equivalent in this case

        # Treating the posterior over hyperparameters as a standard normal, moving z*sqrt(2) should make the likelihood drop by 1, as z should be acting as a unit vector along the principle components of the posterior.

        scalings = np.ones_like(ccd_points)
        # The below code makes me feel dirty. Pythonise it!
        # This is naive scaling, assuming that it is well approximated by a standard normal, in practice you might want to scale in different directions seperately (split normal approximation)
        for j in range(num_free_params*2):
            temp = np.zeros((1, num_free_params))
            if j % 2:
                direction = -1
            else:
                direction = 1
            ind = np.ceil(j // 2)  # This integer division is required, will not work with python 3
            temp[0, ind] = direction

            # This is the point mapped onto the contour of a unit gaussian, stretched by the eigenvectors over the marginal likelihood
            contour_point = modal_params + np.sqrt(2)*temp.dot(z)
            point_density = log_marginal(contour_point)

            if mode_density > point_density:
                # The scale is based on the amount this point must be scaled in order to be on the contour of the stretched standard normal (stretched by principle components)
                scale = np.sqrt(1.0 / (mode_density - point_density))
            else:
                print("Contour point is higher than mode, must be multimodal or mode not found")
                scale = 1  # Print a warning and say "dont scale in this direction"

            # Set the scaling for all dimensions where the point is in this direction, when looking at this dimension, clipped.
            scalings[ccd_points[:, ind]*direction > 0.0, ind] = np.maximum(np.minimum(scale, 10.0), 1/10.0)

        # Make the points *slightly* further out. Aki Vehtari has shown this to be more accurate
        over_scale = 1.1
        ccd_points = over_scale*scalings*ccd_points

        # We have the scaled ccd_points, now we need to map them into parameter space by projecting along principle components and shifting to mode
        param_points = ccd_points.dot(z) + modal_params

        # Evaluate log marginal at all parameter points
        point_densities = np.ones(param_points.shape[0])*np.nan
        for point_ind, param_point in enumerate(param_points):
            point_densities[point_ind] = log_marginal(param_point)

        # Remove nan densities
        non_nan_densities = np.isfinite(point_densities).flatten()
        point_densities = point_densities[non_nan_densities]
        param_points = param_points[non_nan_densities, :]

        point_densities = point_densities - np.max(point_densities)  # Why don't we have to deal with this shift?
        point_densities = np.exp(point_densities)

        if num_free_params > 0:
            # Each section that the ccd point represents needs an associated weight. Since every point is on the same contour, all weights are the same, except the centre weight
            point_weights = 1.0/((2.0*np.pi)**(-num_free_params*0.5) * np.exp(-0.5*num_free_params*over_scale**2) * (param_points.shape[0]-1)*over_scale**2)
            centre_weight = (2*np.pi)**(num_free_params*0.5) * (1 - 1.0/over_scale**2)
            # normalize
            point_weights = point_weights / centre_weight
            centre_weight = 1.0

            point_densities[1:] *= point_weights

        # Normalize density
        point_densities /= point_densities.sum()

        # Remove small density points
        non_small_densities = (point_densities > 0.01/point_densities.size).flatten()
        point_densities = point_densities[non_small_densities]
        param_points = param_points[non_small_densities, :]
        point_densities /= point_densities.sum()

        # for dimension_ind, dimension in enumerate(num_free_params):
            # # Due to the way we built the ccd_points up, every other one changes to the next direction
            # ind = np.ceil(dimension_ind / 2.0)
            # if dimension_ind % 2:
                # direction = 1
            # else:
                # direction = -1
            # print(z[param_dir,:])
        # for direction in [1, -1]:
            # for parameter in range(num_free_params):
                #If the log likelihood doesn't even drop when we move away from the mode, then we havent found the modal points, or there are multiple modes in the posterior (bad)
                # upper_density = self._objective(modal_params + np.sqrt(2)*np.dot(direction,z))

                # We want to work out the scaling for every point, so we first figure out which way the vector is scaling from the mode, and scale in that direction


        # fig = plt.figure()
        # if num_free_params == 3:
            # ax = fig.add_subplot(111, projection='3d')
            # ax.scatter(ccd_points[:,0], ccd_points[:,1], ccd_points[:,2])
        # elif num_free_params == 2:
            # plt.scatter(ccd_points[:,0], ccd_points[:,1])
        # plt.show()
        import paramz

        transformed_points = param_points.copy()
        print transformed_points.shape
        #alan's original code to transform those parameters, to the true space of parameters again
        #mike's change: some parameters have no transform, and thus won't be called by any of the
        #iterations through m2.constraints.items(). To handle these we keep track of those parameters
        #not included, and then add them (untransformed) at the end.
        f = np.ones(self.size).astype(bool)
        f[self.constraints[paramz.transformations.__fixed__]] = paramz.transformations.FIXED

        new_t_points = []
        todo = range(0,sum(f))
        new_t_points = np.zeros_like(transformed_points)
        print new_t_points.shape
        for c, ind in self.constraints.items():
            print c,ind
            if c != paramz.transformations.__fixed__:
                for i in ind[f[ind]]:
                    new_t_points[:,i] = (c.f(transformed_points[:, i]))
                    print i
                    todo.remove(i)

        print todo
        for i in todo:
            print i
            new_t_points[:,i] = transformed_points[:, i]


        return new_t_points, point_densities

    def numerical_parameter_hessian(self, step_length=1e-3):
        """
        Calculate the numerical hessian for the posterior of the parameters

        Often this will want to be done at the modal values of the parameters so the optimizer should be called first

        H = [d^2L_dp1dp1, d^2L_dp1dp2, d^2L_dp1dp3, ...;
             d^2L_dp2dp1, d^2L_dp2dp2, d^2L_dp3dp3, ...;
            ...
             ]
        Where the vector p is all of the parameters in param_array, not just parameters of this model class itself
        """
        # We use optimizer array as we want to ignore any parameters with fixed values
        num_child_params = self.optimizer_array.shape[0]

        H = np.eye(num_child_params)

        initial_params = self.optimizer_array[:].copy()
        # Calculate gradient of marginal likelihood moving one at a time
        for i in range(num_child_params):
            # Pick out the e vector with one in the parameter that we wish to move
            e = H[:,i]
            # Get parameters required to evaluate by nudging left and right
            left = initial_params - step_length*e
            right = initial_params + step_length*e

            left_grad = self._grads(left)
            right_grad = self._grads(right)

            finite_diff = (right_grad - left_grad) / (2*step_length)

            #Put in the correct row of the hessian
            H[:,i] = finite_diff

        return H