Partial work on symbolics. Replacing data_resources.json with the correct full file (-hapmap). Don't know why we've gone for a separate create file ...

This commit is contained in:
Neil Lawrence 2014-04-21 10:05:54 +02:00
parent 196732b83b
commit ad5f03cf91
4 changed files with 688 additions and 317 deletions

View file

@ -14,9 +14,9 @@ except ImportError:
sympy_available=False
if sympy_available:
# These are likelihoods that rely on symbolic.
from symbolic2 import Symbolic
#from sstudent_t import SstudentT
#from negative_binomial import Negative_binomial
from skew_normal2 import Skew_normal
#from skew_exponential import Skew_exponential
from symbolic import Symbolic
from sstudent_t import SstudentT
from negative_binomial import Negative_binomial
from skew_normal import Skew_normal
from skew_exponential import Skew_exponential
#from null_category import Null_category

View file

@ -16,8 +16,7 @@ import link_functions
from symbolic import Symbolic
from scipy import stats
if sympy_available:
class Negative_binomial(Symbolic):
class Negative_binomial(Symbolic):
"""
Negative binomial
@ -31,17 +30,18 @@ if sympy_available:
.. See also::
symbolic.py, for the parent class
"""
def __init__(self, gp_link=None):
def __init__(self, gp_link=None, dispersion=1.0):
parameters = {'dispersion':dispersion}
if gp_link is None:
gp_link = link_functions.Identity()
dispersion = sym.Symbol('dispersion', positive=True, real=True)
y = sym.Symbol('y', nonnegative=True, integer=True)
f = sym.Symbol('f', positive=True, real=True)
y_0 = sym.Symbol('y_0', nonnegative=True, integer=True)
f_0 = sym.Symbol('f_0', positive=True, real=True)
gp_link = link_functions.Log()
log_pdf=dispersion*sym.log(dispersion) - (dispersion+y)*sym.log(dispersion+f) + gammaln(y+dispersion) - gammaln(y+1) - gammaln(dispersion) + y*sym.log(f)
log_pdf=dispersion*sym.log(dispersion) - (dispersion+y_0)*sym.log(dispersion+f_0) + gammaln(y_0+dispersion) - gammaln(y_0+1) - gammaln(dispersion) + y_0*sym.log(f_0)
#log_pdf= -(dispersion+y)*logisticln(f-sym.log(dispersion)) + gammaln(y+dispersion) - gammaln(y+1) - gammaln(dispersion) + y*(f-sym.log(dispersion))
super(Negative_binomial, self).__init__(log_pdf=log_pdf, gp_link=gp_link, name='Negative_binomial')
super(Negative_binomial, self).__init__(log_pdf=log_pdf, parameters=parameters, gp_link=gp_link, name='Negative_binomial')
# TODO: Check this.
self.log_concave = False

View file

@ -1,31 +1,19 @@
# Copyright (c) 2014 GPy Authors
# Licensed under the BSD 3-clause license (see LICENSE.txt)
try:
import sympy as sym
sympy_available=True
from sympy.utilities.lambdify import lambdify
except ImportError:
sympy_available=False
import sympy as sym
import numpy as np
import link_functions
from scipy import stats, integrate
from scipy.special import gammaln, gamma, erf, erfc, erfcx, polygamma
from GPy.util.functions import normcdf, normcdfln, logistic, logisticln
from likelihood import Likelihood
from ..core.parameterization import Param
from ..core.symbolic import Symbolic_core
if sympy_available:
class Symbolic(Likelihood):
class Symbolic(Likelihood, Symbolic_core):
"""
Symbolic likelihood.
Likelihood where the form of the likelihood is provided by a sympy expression.
"""
def __init__(self, log_pdf=None, logZ=None, missing_log_pdf=None, gp_link=None, name='symbolic', log_concave=False, param=None, func_modules=[]):
def __init__(self, log_pdf=None, logZ=None, missing_log_pdf=None, gp_link=None, name='symbolic', log_concave=False, parameters=None, func_modules=[]):
if gp_link is None:
gp_link = link_functions.Identity()
@ -33,115 +21,76 @@ if sympy_available:
if log_pdf is None:
raise ValueError, "You must provide an argument for the log pdf."
self.func_modules = func_modules
self.func_modules += [{'gamma':gamma,
'gammaln':gammaln,
'erf':erf, 'erfc':erfc,
'erfcx':erfcx,
'polygamma':polygamma,
'normcdf':normcdf,
'normcdfln':normcdfln,
'logistic':logistic,
'logisticln':logisticln},
'numpy']
Likelihood.__init__(self, gp_link, name=name)
functions = {'log_pdf':log_pdf}
self.cacheable = ['F', 'Y']
super(Symbolic, self).__init__(gp_link, name=name)
self.missing_data = False
self._sym_log_pdf = log_pdf
if missing_log_pdf:
self.missing_data = True
self._sym_missing_log_pdf = missing_log_pdf
functions['missing_log_pdf']=missing_log_pdf
# pull the variable names out of the symbolic pdf
sym_vars = [e for e in self._sym_log_pdf.atoms() if e.is_Symbol]
self._sym_f = [e for e in sym_vars if e.name=='f']
if not self._sym_f:
raise ValueError('No variable f in log pdf.')
self._sym_y = [e for e in sym_vars if e.name=='y']
if not self._sym_y:
raise ValueError('No variable y in log pdf.')
self._sym_theta = sorted([e for e in sym_vars if not (e.name=='f' or e.name=='y')],key=lambda e:e.name)
theta_names = [theta.name for theta in self._sym_theta]
if self.missing_data:
# pull the variable names out of missing data
sym_vars = [e for e in self._sym_missing_log_pdf.atoms() if e.is_Symbol]
sym_f = [e for e in sym_vars if e.name=='f']
if not sym_f:
raise ValueError('No variable f in missing data log pdf.')
sym_y = [e for e in sym_vars if e.name=='y']
if sym_y:
raise ValueError('Data is present in missing data portion of likelihood.')
# additional missing data parameters
missing_theta = sorted([e for e in sym_vars if not (e.name=='f' or e.name=='missing' or e.name in theta_names)],key=lambda e:e.name)
self._sym_theta += missing_theta
self._sym_theta = sorted(self._sym_theta, key=lambda e:e.name)
# These are all the arguments need to compute likelihoods.
self.arg_list = self._sym_y + self._sym_f + self._sym_theta
# these are arguments for computing derivatives.
derivative_arguments = self._sym_f + self._sym_theta
# Do symbolic work to compute derivatives.
self._log_pdf_derivatives = {theta.name : stabilise(sym.diff(self._sym_log_pdf,theta)) for theta in derivative_arguments}
self._log_pdf_second_derivatives = {theta.name : stabilise(sym.diff(self._log_pdf_derivatives['f'],theta)) for theta in derivative_arguments}
self._log_pdf_third_derivatives = {theta.name : stabilise(sym.diff(self._log_pdf_second_derivatives['f'],theta)) for theta in derivative_arguments}
if self.missing_data:
# Do symbolic work to compute derivatives.
self._missing_log_pdf_derivatives = {theta.name : stabilise(sym.diff(self._sym_missing_log_pdf,theta)) for theta in derivative_arguments}
self._missing_log_pdf_second_derivatives = {theta.name : stabilise(sym.diff(self._missing_log_pdf_derivatives['f'],theta)) for theta in derivative_arguments}
self._missing_log_pdf_third_derivatives = {theta.name : stabilise(sym.diff(self._missing_log_pdf_second_derivatives['f'],theta)) for theta in derivative_arguments}
# Add parameters to the model.
for theta in self._sym_theta:
val = 1.0
# TODO: need to decide how to handle user passing values for the se parameter vectors.
if param is not None:
if param.has_key(theta.name):
val = param[theta.name]
setattr(self, theta.name, Param(theta.name, val, None))
self.add_parameters(getattr(self, theta.name))
self.ep_analytic = False
if logZ:
self.ep_analytic = True
functions['logZ'] = logZ
self.cacheable += ['M', 'V']
Symbolic_core.__init__(self, functions, cacheable=self.cacheable, derivatives = ['F', 'theta'], parameters=parameters, func_modules=func_modules)
# TODO: Is there an easy way to check whether the pdf is log
# concave? For the moment, need user to specify.
self.log_concave = log_concave
# initialise code arguments
self._arguments = {}
# generate the code for the pdf and derivatives
self._gen_code()
def list_functions(self):
"""Return a list of all symbolic functions in the model and their names."""
def _gen_code(self):
"""Generate the code from the symbolic parts that will be used for likleihod computation."""
# TODO: Check here whether theano is available and set up
# functions accordingly.
symbolic_functions = [self._sym_log_pdf]
deriv_list = [self._log_pdf_derivatives, self._log_pdf_second_derivatives, self._log_pdf_third_derivatives]
symbolic_functions += [deriv[key] for key in sorted(deriv.keys()) for deriv in deriv_list]
def _set_derivatives(self, derivatives):
# these are arguments for computing derivatives.
print "Whoop"
Symbolic_core._set_derivatives(self, derivatives)
# add second and third derivatives for Laplace approximation.
derivative_arguments = []
if derivatives is not None:
for derivative in derivatives:
derivative_arguments += self.variables[derivative]
exprs = ['log_pdf']
if self.missing_data:
symbolic_functions+=[self._sym_missing_log_pdf]
deriv_list = [self._missing_log_pdf_derivatives, self._missing_log_pdf_second_derivatives, self._missing_log_pdf_third_derivatives]
symbolic_functions += [deriv[key] for key in sorted(deriv.keys()) for deriv in deriv_list]
# self._log_pdf_function = lambdify(self.arg_list, self._sym_log_pdf, self.func_modules)
exprs.append('missing_log_pdf')
for expr in exprs:
self.expressions[expr]['second_derivative'] = {theta.name : self.stabilize(sym.diff(self.expressions[expr]['derivative']['f_0'], theta)) for theta in derivative_arguments}
self.expressions[expr]['third_derivative'] = {theta.name : self.stabilize(sym.diff(self.expressions[expr]['second_derivative']['f_0'], theta)) for theta in derivative_arguments}
if self.ep_analytic:
derivative_arguments = [M]
# add second derivative for EP
exprs = ['logZ']
if self.missing_data:
exprs.append('missing_logZ')
for expr in exprs:
self.expressions[expr]['second_derivative'] = {theta.name : self.stabilize(sym.diff(self.expressions[expr]['derivative'], theta)) for theta in derivative_arguments}
# # compute code for derivatives
# self._derivative_code = {key: lambdify(self.arg_list, self._log_pdf_derivatives[key], self.func_modules) for key in self._log_pdf_derivatives.keys()}
# self._second_derivative_code = {key: lambdify(self.arg_list, self._log_pdf_second_derivatives[key], self.func_modules) for key in self._log_pdf_second_derivatives.keys()}
# self._third_derivative_code = {key: lambdify(self.arg_list, self._log_pdf_third_derivatives[key], self.func_modules) for key in self._log_pdf_third_derivatives.keys()}
# if self.missing_data:
# self._missing_derivative_code = {key: lambdify(self.arg_list, self._missing_log_pdf_derivatives[key], self.func_modules) for key in self._missing_log_pdf_derivatives.keys()}
# self._missing_second_derivative_code = {key: lambdify(self.arg_list, self._missing_log_pdf_second_derivatives[key], self.func_modules) for key in self._missing_log_pdf_second_derivatives.keys()}
# self._missing_third_derivative_code = {key: lambdify(self.arg_list, self._missing_log_pdf_third_derivatives[key], self.func_modules) for key in self._missing_log_pdf_third_derivatives.keys()}
def eval_update_cache(self, Y, **kwargs):
# TODO: place checks for inf/nan in here
# for all provided keywords
Symbolic_core.eval_update_cache(self, Y=Y, **kwargs)
# Y = np.atleast_2d(Y)
# for variable, code in sorted(self.code['parameters_changed'].iteritems()):
# self._set_attribute(variable, eval(code, self.namespace))
# for i, theta in enumerate(self.variables['Y']):
# missing = np.isnan(Y[:, i])
# self._set_attribute('missing_' + str(i), missing)
# self._set_attribute(theta.name, value[missing, i][:, None])
# for variable, value in kwargs.items():
# # update their cached values
# if value is not None:
# if variable == 'F' or variable == 'M' or variable == 'V' or variable == 'Y_metadata':
# for i, theta in enumerate(self.variables[variable]):
# self._set_attribute(theta.name, value[:, i][:, None])
# else:
# self._set_attribute(theta.name, value[:, i])
# for variable, code in sorted(self.code['update_cache'].iteritems()):
# self._set_attribute(variable, eval(code, self.namespace))
# TODO: compute EP code parts based on logZ. We need dlogZ/dmu, d2logZ/dmu2 and dlogZ/dtheta
def parameters_changed(self):
pass
@ -160,43 +109,30 @@ if sympy_available:
# computed in the inference code. TODO: Thought: How does this
# effect EP? Shouldn't this be done by a separate
# Laplace-approximation specific call?
for grad, theta in zip(grads, self._sym_theta):
for theta, grad in zip(self.variables['theta'], grads):
parameter = getattr(self, theta.name)
setattr(parameter, 'gradient', grad)
def _arguments_update(self, f, y):
"""Set up argument lists for the derivatives."""
# If we do make use of Theano, then at this point we would
# need to do a lot of precomputation to ensure that the
# likelihoods and gradients are computed together, then check
# for parameter changes before updating.
for i, fvar in enumerate(self._sym_f):
self._arguments[fvar.name] = f
for i, yvar in enumerate(self._sym_y):
self._arguments[yvar.name] = y
for theta in self._sym_theta:
self._arguments[theta.name] = np.asarray(getattr(self, theta.name))
def pdf_link(self, inv_link_f, y, Y_metadata=None):
def pdf_link(self, f, y, Y_metadata=None):
"""
Likelihood function given inverse link of f.
:param inv_link_f: inverse link of latent variables.
:type inv_link_f: Nx1 array
:param f: inverse link of latent variables.
:type f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: Y_metadata which is not used in student t distribution
:returns: likelihood evaluated for this point
:rtype: float
"""
return np.exp(self.logpdf_link(inv_link_f, y, Y_metadata=None))
return np.exp(self.logpdf_link(f, y, Y_metadata=None))
def logpdf_link(self, inv_link_f, y, Y_metadata=None):
def logpdf_link(self, f, y, Y_metadata=None):
"""
Log Likelihood Function given inverse link of latent variables.
:param inv_inv_link_f: latent variables (inverse link of f)
:type inv_inv_link_f: Nx1 array
:param f: latent variables (inverse link of f)
:type f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: Y_metadata
@ -204,20 +140,23 @@ if sympy_available:
:rtype: float
"""
assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
self._arguments_update(inv_link_f, y)
assert np.atleast_1d(f).shape == np.atleast_1d(y).shape
if self.missing_data:
ll = np.where(np.isnan(y), self._missing_log_pdf_function(**self._missing_arguments), self._log_pdf_function(**self._arguments))
missing_flag = np.isnan(y)
not_missing_flag = np.logical_not(missing_flag)
ll = self.eval_function('missing_log_pdf', F=f[missing_flag]).sum()
ll += self.eval_function('log_pdf', F=f[not_missing_flag], Y=y[not_missing_flag], Y_metadata=Y_metadata[not_missing_flag]).sum()
else:
ll = np.where(np.isnan(y), 0., self._log_pdf_function(**self._arguments))
return np.sum(ll)
ll = self.eval_function('log_pdf', F=f, Y=y, Y_metadata=Y_metadata).sum()
def dlogpdf_dlink(self, inv_link_f, y, Y_metadata=None):
return ll
def dlogpdf_dlink(self, f, y, Y_metadata=None):
"""
Gradient of log likelihood with respect to the inverse link function.
:param inv_inv_link_f: latent variables (inverse link of f)
:type inv_inv_link_f: Nx1 array
:param f: latent variables (inverse link of f)
:type f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: Y_metadata
@ -225,21 +164,25 @@ if sympy_available:
:rtype: Nx1 array
"""
assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
self._arguments_update(inv_link_f, y)
assert np.atleast_1d(f).shape == np.atleast_1d(y).shape
self.eval_update_cache(F=f, Y=y, Y_metadata=Y_metadata)
if self.missing_data:
return np.where(np.isnan(y), self._missing_derivative_code['f'](**self._missing_argments), self._derivative_code['f'](**self._argments))
return np.where(np.isnan(y),
eval(self.code['missing_log_pdf']['derivative']['f_0'], self.namespace),
eval(self.code['log_pdf']['derivative']['f_0'], self.namespace))
else:
return np.where(np.isnan(y), 0., self._derivative_code['f'](**self._arguments))
return np.where(np.isnan(y),
0.,
eval(self.code['log_pdf']['derivative']['f_0'], self.namespace))
def d2logpdf_dlink2(self, inv_link_f, y, Y_metadata=None):
def d2logpdf_dlink2(self, f, y, Y_metadata=None):
"""
Hessian of log likelihood given inverse link of latent variables with respect to that inverse link.
i.e. second derivative logpdf at y given inv_link(f_i) and inv_link(f_j) w.r.t inv_link(f_i) and inv_link(f_j).
:param inv_link_f: inverse link of the latent variables.
:type inv_link_f: Nx1 array
:param f: inverse link of the latent variables.
:type f: Nx1 array
:param y: data
:type y: Nx1 array
:param Y_metadata: Y_metadata which is not used in student t distribution
@ -252,52 +195,72 @@ if sympy_available:
distribution for y_i depends only on link(f_i) not on
link(f_(j!=i))
"""
assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
self._arguments_update(inv_link_f, y)
assert np.atleast_1d(f).shape == np.atleast_1d(y).shape
self.eval_update_cache(F=f, Y=y, Y_metadata=Y_metadata)
if self.missing_data:
return np.where(np.isnan(y), self._missing_second_derivative_code['f'](**self._missing_argments), self._second_derivative_code['f'](**self._argments))
return np.where(np.isnan(y),
eval(self.code['missing_log_pdf']['second_derivative']['f_0'], self.namespace),
eval(self.code['log_pdf']['second_derivative']['f_0'], self.namespace))
else:
return np.where(np.isnan(y), 0., self._second_derivative_code['f'](**self._arguments))
return np.where(np.isnan(y),
0.,
eval(self.code['log_pdf']['second_derivative']['f_0'], self.namespace))
def d3logpdf_dlink3(self, inv_link_f, y, Y_metadata=None):
assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
self._arguments_update(inv_link_f, y)
def d3logpdf_dlink3(self, f, y, Y_metadata=None):
assert np.atleast_1d(f).shape == np.atleast_1d(y).shape
self.eval_update_cache(F=f, Y=y, Y_metadata=Y_metadata)
if self.missing_data:
return np.where(np.isnan(y), self._missing_third_derivative_code['f'](**self._missing_argments), self._third_derivative_code['f'](**self._argments))
return np.where(np.isnan(y),
eval(self.code['missing_log_pdf']['third_derivative']['f_0'], self.namespace),
eval(self.code['log_pdf']['third_derivative']['f_0'], self.namespace))
else:
return np.where(np.isnan(y), 0., self._third_derivative_code['f'](**self._arguments))
return np.where(np.isnan(y),
0.,
eval(self.code['log_pdf']['third_derivative']['f_0'], self.namespace))
def dlogpdf_link_dtheta(self, inv_link_f, y, Y_metadata=None):
assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
self._arguments_update(inv_link_f, y)
g = np.zeros((np.atleast_1d(y).shape[0], len(self._sym_theta)))
for i, theta in enumerate(self._sym_theta):
def dlogpdf_link_dtheta(self, f, y, Y_metadata=None):
assert np.atleast_1d(f).shape == np.atleast_1d(y).shape
self.eval_update_cache(F=f, Y=y, Y_metadata=Y_metadata)
g = np.zeros((np.atleast_1d(y).shape[0], len(self.variables['theta'])))
for i, theta in enumerate(self.variables['theta']):
if self.missing_data:
g[:, i:i+1] = np.where(np.isnan(y), self._missing_derivative_code[theta.name](**self._arguments), self._derivative_code[theta.name](**self._arguments))
g[:, i:i+1] = np.where(np.isnan(y),
eval(self.code['missing_log_pdf']['derivative'][theta.name], self.namespace),
eval(self.code['log_pdf']['derivative'][theta.name], self.namespace))
else:
g[:, i:i+1] = np.where(np.isnan(y), 0., self._derivative_code[theta.name](**self._arguments))
g[:, i:i+1] = np.where(np.isnan(y),
0.,
eval(self.code['log_pdf']['derivative'][theta.name], self.namespace))
return g.sum(0)
def dlogpdf_dlink_dtheta(self, inv_link_f, y, Y_metadata=None):
assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
self._arguments_update(inv_link_f, y)
g = np.zeros((np.atleast_1d(y).shape[0], len(self._sym_theta)))
for i, theta in enumerate(self._sym_theta):
def dlogpdf_dlink_dtheta(self, f, y, Y_metadata=None):
assert np.atleast_1d(f).shape == np.atleast_1d(y).shape
self.eval_update_cache(F=f, Y=y, Y_metadata=Y_metadata)
g = np.zeros((np.atleast_1d(y).shape[0], len(self.variables['theta'])))
for i, theta in enumerate(self.variables['theta']):
if self.missing_data:
g[:, i:i+1] = np.where(np.isnan(y), self._missing_second_derivative_code[theta.name](**self._arguments), self._second_derivative_code[theta.name](**self._arguments))
g[:, i:i+1] = np.where(np.isnan(y),
eval(self.code['missing_log_pdf']['second_derivative'][theta.name], self.namespace),
eval(self.code['log_pdf']['second_derivative'][theta.name], self.namespace))
else:
g[:, i:i+1] = np.where(np.isnan(y), 0., self._second_derivative_code[theta.name](**self._arguments))
g[:, i:i+1] = np.where(np.isnan(y),
0.,
eval(self.code['log_pdf']['second_derivative'][theta.name], self.namespace))
return g
def d2logpdf_dlink2_dtheta(self, inv_link_f, y, Y_metadata=None):
assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
self._arguments_update(inv_link_f, y)
g = np.zeros((np.atleast_1d(y).shape[0], len(self._sym_theta)))
for i, theta in enumerate(self._sym_theta):
def d2logpdf_dlink2_dtheta(self, f, y, Y_metadata=None):
assert np.atleast_1d(f).shape == np.atleast_1d(y).shape
self.eval_update_cache(F=f, Y=y, Y_metadata=Y_metadata)
g = np.zeros((np.atleast_1d(y).shape[0], len(self.variables['theta'])))
for i, theta in enumerate(self.variables['theta']):
if self.missing_data:
g[:, i:i+1] = np.where(np.isnan(y), self._missing_third_derivative_code[theta.name](**self._arguments), self._third_derivative_code[theta.name](**self._arguments))
g[:, i:i+1] = np.where(np.isnan(y),
eval(self.code['missing_log_pdf']['third_derivative'][theta.name], self.namespace),
eval(self.code['log_pdf']['third_derivative'][theta.name], self.namespace))
else:
g[:, i:i+1] = np.where(np.isnan(y), 0., self._third_derivative_code[theta.name](**self._arguments))
g[:, i:i+1] = np.where(np.isnan(y),
0.,
eval(self.code['log_pdf']['third_derivative'][theta.name], self.namespace))
return g
def predictive_mean(self, mu, sigma, Y_metadata=None):

File diff suppressed because one or more lines are too long