import numpy as np
import scipy as sp
import scipy.optimize
import GPy
from functools import partial
from GPy.likelihoods.likelihood import likelihood
from GPy.util.linalg import jitchol, pdinv, mdot
from scipy.stats import norm


class Laplace(likelihood):
    """Laplace approximation to a posterior"""

    def __init__(self, data, likelihood_function):
        """
        Laplace Approximation

        First find the mode \hat{f} and the hessian at this point (using
        Newton-Raphson), then find the z^{prime} which turns the
        unnormalised gaussian into a normalised one.

        Finally we must compute the GP variables (i.e. generate some
        Y^{squiggle} and z^{squiggle} which make a gaussian the same as the
        laplace approximation).

        Arguments
        ---------
        :data: observed data, an (N, D) array
        :likelihood_function: object exposing link_function, link_grad and link_hess
        """
        self.data = data
        self.likelihood_function = likelihood_function

        #Initial values
        self.N, self.D = self.data.shape

    def _compute_GP_variables(self):
        r"""
        Generates the data Y which would give a normal distribution identical
        to the laplace approximation.

        GPy expects a likelihood to be gaussian, so we need to calculate the
        points Y^{squiggle} and Z^{squiggle} that make the posterior match the
        one found by a laplace approximation to a non-gaussian likelihood.

        Given we are approximating $p(y|f)p(f)$ with a normal distribution
        (since $p(y|f)$ is not normal), we have a rescaled normal distribution
        z*N(f|f_hat,hess_hat^{-1}) with the same area as p(y|f)p(f) due to the
        z rescaling.

        At the moment the data Y correspond to the normal approximation
        z*N(f|f_hat,hess_hat^{-1}).

        This function finds the data D=(Y_tilde,X) that would produce
        z*N(f|f_hat,hess_hat^{-1}), giving a normal approximation of
        z_tilde*p(Y_tilde|f,X)p(f)

        $$\tilde{Y} = \tilde{\Sigma} H \hat{f}$$
        where
        $$\tilde{\Sigma}^{-1} = H - K^{-1}$$
        i.e.
        $$\tilde{\Sigma}^{-1} = -diag(\nabla\nabla \log p(y|f))$$
        since $-diag(\nabla\nabla \log p(y|f)) = H - K^{-1}$
        and
        $$\ln \tilde{z} = \ln z + \frac{N}{2}\ln 2\pi + \frac{1}{2}\tilde{Y}^{T}\tilde{\Sigma}^{-1}\tilde{Y}$$
        """
        #\tilde{\Sigma}^{-1} = H - K^{-1}, assumed positive definite
        #(true for log-concave likelihoods)
        Sigma_tilde_i = self.hess_hat - self.Ki
        (self.Sigma_tilde, _, _, _) = pdinv(Sigma_tilde_i)

        #\tilde{Y} = \tilde{\Sigma} H \hat{f}
        self.Y_tilde = mdot(self.Sigma_tilde, self.hess_hat, self.f_hat[:, None])

        #\ln \tilde{z} following the docstring formula (self.Z holds the log)
        self.Z = (np.log(self.z_hat)
                  + 0.5 * self.N * np.log(2 * np.pi)
                  + 0.5 * float(mdot(self.Y_tilde.T, Sigma_tilde_i, self.Y_tilde)))

        #TODO: remaining attributes GPy expects from a gaussian likelihood
        #self.Y =
        #self.YYT =
        #self.covariance_matrix =
        #self.precision =

    def fit_full(self, K):
        """
        The laplace approximation algorithm.
        For nomenclature see Rasmussen & Williams 2006.

        :K: Covariance matrix
        """
        f = np.zeros((self.N, 1))
        #K = np.diag(np.ones(self.N))
        (self.Ki, _, _, self.log_Kdet) = pdinv(K)
        obj_constant = (0.5 * self.log_Kdet) - ((0.5 * self.N) * np.log(2 * np.pi))

        #Find \hat{f} using a newton-raphson optimizer for example
        #TODO: Add newton-raphson as subclass of optimizer class
        #FIXME: Can we get rid of this horrible reshaping?
        def obj(f):
            #Negative unnormalised log posterior: -(ln p(y|f) - 0.5 f^T K^{-1} f + const)
            f = f[:, None]
            res = -1 * (self.likelihood_function.link_function(self.data, f)
                        - 0.5 * mdot(f.T, (self.Ki, f))
                        + obj_constant)
            return float(res)

        def obj_grad(f):
            f = f[:, None]
            res = -1 * (self.likelihood_function.link_grad(self.data, f) - mdot(self.Ki, f))
            return np.squeeze(res)

        def obj_hess(f):
            f = f[:, None]
            res = -1 * (np.diag(self.likelihood_function.link_hess(self.data, f)) - self.Ki)
            return np.squeeze(res)

        self.f_hat = sp.optimize.fmin_ncg(obj, f, fprime=obj_grad, fhess=obj_hess)
        print(self.f_hat)

        #At this point get the hessian matrix
        self.hess_hat = obj_hess(self.f_hat)

        #Need to add the constant as we previously were trying to avoid computing it
        #(seems like a small overhead though...)
        self.height_unnormalised = -1 * obj(self.f_hat)  #FIXME: Is it - obj constant and *-1?
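        #-obj(f_hat) is the log height of the unnormalised posterior at the mode:
        #\sum_n ln p(y_n|\hat{f}_n) - 0.5 \hat{f}^T K^{-1} \hat{f} + obj_constant
        #(modulo the FIXME above); it is reused to compute z_hat below.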
        #z_hat is how much we need to scale the normal distribution by to get the area of
        #our approximation close to the area of p(f)p(y|f); we do this by matching the
        #height of the two distributions at the mode
        #ln z_hat = -0.5*ln|H| - 0.5*ln|K| - 0.5*f_hat^T K^{-1} f_hat + \sum_n ln p(y_n|f_n)
        self.z_hat = np.exp(-0.5 * np.log(np.linalg.det(self.hess_hat)) + self.height_unnormalised)

        return self._compute_GP_variables()
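

#A minimal usage sketch, not part of the class above. It assumes a
#likelihood_function object exposing link_function(data, f), link_grad(data, f)
#and link_hess(data, f) -- the interface fit_full relies on. ProbitLikelihood
#is a hypothetical name for such a wrapper; the covariance K is built directly
#with numpy so the sketch does not depend on a particular GPy kernel API.
if __name__ == "__main__":
    np.random.seed(0)
    N = 20
    X = np.sort(np.random.randn(N, 1), axis=0)
    Y = np.where(np.random.randn(N, 1) > 0, 1.0, -1.0)  #binary labels

    #Simple squared-exponential covariance with a small jitter for stability
    K = np.exp(-0.5 * (X - X.T) ** 2) + 1e-6 * np.eye(N)

    #likelihood_function = ProbitLikelihood()     #hypothetical non-gaussian likelihood
    #laplace = Laplace(Y, likelihood_function)
    #laplace.fit_full(K)                          #finds f_hat, hess_hat, z_hat, then
    #                                             #converts to Y_tilde, Sigma_tilde, Z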