Merge branch 'params' of github.com:SheffieldML/GPy into params

This commit is contained in:
Ricardo 2014-04-17 15:28:13 +01:00
commit b6c4c39261
21 changed files with 955 additions and 587 deletions

View file

@@ -65,6 +65,14 @@ class Mapping(Parameterized):
else:
raise NameError, "matplotlib package has not been imported."
class Bijective_mapping(Mapping):
"""This is a mapping that is bijective, i.e. you can go from X to f and also back from f to X. The inverse mapping is called g()."""
def __init__(self, input_dim, output_dim, name='bijective_mapping'):
super(Bijective_mapping, self).__init__(input_dim=input_dim, output_dim=output_dim, name=name)
def g(self, f):
"""Inverse mapping from output domain of the function to the inputs."""
raise NotImplementedError
from model import Model
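A hedged usage sketch (not part of this commit): a hypothetical Affine1D subclass implementing both f and its inverse g, to show the intended round trip. Mapping.__init__ is assumed to accept input_dim, output_dim and name, as the linear mapping later in this diff suggests.

import numpy as np
from GPy.core.mapping import Bijective_mapping

class Affine1D(Bijective_mapping):
    """y = a*x + b; invertible whenever a != 0 (illustrative only)."""
    def __init__(self, a=2.0, b=1.0):
        super(Affine1D, self).__init__(input_dim=1, output_dim=1, name='affine')
        self.a, self.b = a, b
    def f(self, X):
        return self.a*X + self.b
    def g(self, f):
        return (f - self.b)/self.a

m = Affine1D()
X = np.random.randn(4, 1)
print np.allclose(m.g(m.f(X)), X)  # the inverse recovers X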

View file

@@ -184,7 +184,7 @@ class ParameterIndexOperationsView(object):
def remove(self, prop, indices):
removed = self._param_index_ops.remove(prop, numpy.array(indices)+self._offset)
if removed.size > 0:
return removed - self._size + 1
return removed-self._offset
return removed

View file

@@ -312,7 +312,8 @@ class Indexable(object):
This does not need to account for shaped parameters, as it
basically just sums up the parameter sizes which come before param.
"""
raise NotImplementedError, "shouldnt happen, offset required from non parameterization object?"
return 0
#raise NotImplementedError, "shouldnt happen, offset required from non parameterization object?"
def _raveled_index_for(self, param):
"""
@@ -320,7 +321,8 @@
that is an int array, containing the indexes for the flattened
param inside this parameterized logic.
"""
raise NotImplementedError, "shouldnt happen, raveld index transformation required from non parameterization object?"
return param._raveled_index()
#raise NotImplementedError, "shouldnt happen, raveld index transformation required from non parameterization object?"
class Constrainable(Nameable, Indexable, Observable):
@@ -368,10 +370,10 @@ class Constrainable(Nameable, Indexable, Observable):
if value is not None:
self[:] = value
reconstrained = self.unconstrain()
self._add_to_index_operations(self.constraints, reconstrained, __fixed__, warning)
rav_i = self._highest_parent_._raveled_index_for(self)
self._highest_parent_._set_fixed(rav_i)
index = self._add_to_index_operations(self.constraints, reconstrained, __fixed__, warning)
self._highest_parent_._set_fixed(self, index)
self.notify_observers(self, None if trigger_parent else -np.inf)
return index
fix = constrain_fixed
def unconstrain_fixed(self):
@@ -379,7 +381,8 @@ class Constrainable(Nameable, Indexable, Observable):
This parameter will no longer be fixed.
"""
unconstrained = self.unconstrain(__fixed__)
self._highest_parent_._set_unfixed(unconstrained)
self._highest_parent_._set_unfixed(self, unconstrained)
return unconstrained
unfix = unconstrain_fixed
def _ensure_fixes(self):
@@ -388,14 +391,16 @@ class Constrainable(Nameable, Indexable, Observable):
# Param: ones(self._realsize_)
if not self._has_fixes(): self._fixes_ = np.ones(self.size, dtype=bool)
def _set_fixed(self, index):
def _set_fixed(self, param, index):
self._ensure_fixes()
self._fixes_[index] = FIXED
offset = self._offset_for(param)
self._fixes_[index+offset] = FIXED
if np.all(self._fixes_): self._fixes_ = None # ==UNFIXED
def _set_unfixed(self, index):
def _set_unfixed(self, param, index):
self._ensure_fixes()
self._fixes_[index] = UNFIXED
offset = self._offset_for(param)
self._fixes_[index+offset] = UNFIXED
if np.all(self._fixes_): self._fixes_ = None # ==UNFIXED
def _connect_fixes(self):
@@ -469,8 +474,9 @@ class Constrainable(Nameable, Indexable, Observable):
"""
self.param_array[...] = transform.initialize(self.param_array)
reconstrained = self.unconstrain()
self._add_to_index_operations(self.constraints, reconstrained, transform, warning)
added = self._add_to_index_operations(self.constraints, reconstrained, transform, warning)
self.notify_observers(self, None if trigger_parent else -np.inf)
return added
def unconstrain(self, *transforms):
"""
@@ -549,7 +555,9 @@ class Constrainable(Nameable, Indexable, Observable):
if warning and reconstrained.size > 0:
# TODO: figure out which parameters have changed and only print those
print "WARNING: reconstraining parameters {}".format(self.parameter_names() or self.name)
which.add(what, self._raveled_index())
index = self._raveled_index()
which.add(what, index)
return index
def _remove_from_index_operations(self, which, transforms):
"""
@@ -561,9 +569,10 @@ class Constrainable(Nameable, Indexable, Observable):
removed = np.empty((0,), dtype=int)
for t in transforms:
unconstrained = which.remove(t, self._raveled_index())
print unconstrained
removed = np.union1d(removed, unconstrained)
if t is __fixed__:
self._highest_parent_._set_unfixed(unconstrained)
self._highest_parent_._set_unfixed(self, unconstrained)
return removed
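The net effect of the constrain/unconstrain changes above, mirrored by the updated tests later in this commit: fix() and unfix() now return the raveled indices they touched. A sketch of the assumed usage, where model is any Parameterized model (hypothetical placeholder):

index = model.kern.lengthscale.fix()   # returns the raveled indices fixed, e.g. array([0])
model.kern.lengthscale.unfix()         # returns the same indices back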

349
GPy/core/symbolic.py Normal file
View file

@@ -0,0 +1,349 @@
# Copyright (c) 2014, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import sys
from ..core.parameterization import Parameterized
import numpy as np
import sympy as sym
from ..core.parameterization import Param
from sympy.utilities.lambdify import lambdastr, _imp_namespace, _get_namespace
from sympy.utilities.iterables import numbered_symbols
from sympy import exp
from scipy.special import gammaln, gamma, erf, erfc, erfcx, polygamma
from GPy.util.functions import normcdf, normcdfln, logistic, logisticln
class Symbolic_core():
"""
Base model symbolic class.
"""
def __init__(self, expression, cacheable, derivatives=None, param=None, func_modules=[]):
# Base class init, do some basic derivatives etc.
# Func_modules sets up the right mapping for functions.
self.func_modules = list(func_modules)  # copy: avoid mutating the shared default list
self.func_modules += [{'gamma':gamma,
'gammaln':gammaln,
'erf':erf, 'erfc':erfc,
'erfcx':erfcx,
'polygamma':polygamma,
'normcdf':normcdf,
'normcdfln':normcdfln,
'logistic':logistic,
'logisticln':logisticln},
'numpy']
self.expressions = {}
self.expressions['function'] = expression
self.cacheable = cacheable
# pull the parameters and inputs out of the symbolic pdf
self.variables = {}
vars = [e for e in expression.atoms() if e.is_Symbol]
# inputs are assumed to be those things that are
# cacheable. I.e. those things that aren't stored within the
# object except as cached. For covariance functions this is X
# and Z, for likelihoods F and for mapping functions X.
self.cacheable_vars = [] # list of everything that's cacheable
for var in cacheable:
self.variables[var] = [e for e in vars if e.name.split('_')[0]==var.lower()]
self.cacheable_vars += self.variables[var]
for var in cacheable:
if not self.variables[var]:
raise ValueError('Variable ' + var + ' was specified as cacheable but is not in expression. Expected to find symbols of the form ' + var.lower() + '_0 to represent ' + var)
# things that aren't cacheable are assumed to be parameters.
self.variables['theta'] = sorted([e for e in vars if not e in self.cacheable_vars],key=lambda e:e.name)
# these are arguments for computing derivatives.
derivative_arguments = []
if derivatives is not None:
for derivative in derivatives:
derivative_arguments += self.variables[derivative]
# Do symbolic work to compute derivatives.
self.expressions['derivative'] = {theta.name : sym.diff(self.expressions['function'],theta) for theta in derivative_arguments}
# Add parameters to the model.
for theta in self.variables['theta']:
val = 1.0
# TODO: need to decide how to handle user passing values for these parameter vectors.
if param is not None:
if param.has_key(theta.name):
val = param[theta.name]
# Add parameter.
setattr(self, theta.name, Param(theta.name, val, None))
self.add_parameters(getattr(self, theta.name))
self.namespace = [globals(), self.__dict__]
self._gen_code()
def eval_parameters_changed(self):
# TODO: place checks for inf/nan in here
# do all the precomputation codes.
for variable, code in sorted(self.code['params_change'].iteritems()):
setattr(self, variable, eval(code, *self.namespace))
self.eval_update_cache()
def eval_update_cache(self, X=None):
# TODO: place checks for inf/nan in here
if X is not None:
for i, theta in enumerate(self.variables['X']):
setattr(self, theta.name, X[:, i][:, None])
for variable, code in sorted(self.code['update_cache'].iteritems()):
setattr(self, variable, eval(code, *self.namespace))
def eval_update_gradients(self, partial, X):
# TODO: place checks for inf/nan in here
for theta in self.variables['theta']:
code = self.code['derivative'][theta.name]
setattr(getattr(self, theta.name),
'gradient',
(partial*eval(code, *self.namespace)).sum())
def eval_gradients_X(self, partial, X):
gradients_X = np.zeros_like(X)
self.eval_update_cache(X)
for i, theta in enumerate(self.variables['X']):
code = self.code['derivative'][theta.name]
gradients_X[:, i:i+1] = partial*eval(code, *self.namespace)
return gradients_X
def eval_f(self, X):
self.eval_update_cache(X)
return eval(self.code['function'], *self.namespace)
def code_parameters_changed(self):
# do all the precomputation codes.
lcode = ''
for variable, code in sorted(self.code['params_change'].iteritems()):
lcode += variable + ' = ' + self._print_code(code) + '\n'
return lcode
def code_update_cache(self):
lcode = 'if X is not None:\n'
for i, theta in enumerate(self.variables['X']):
lcode+= "\t" + self._print_code(theta.name) + ' = X[:, ' + str(i) + "][:, None]\n"
for variable, code in sorted(self.code['update_cache'].iteritems()):
lcode+= self._print_code(variable) + ' = ' + self._print_code(code) + "\n"
return lcode
def code_update_gradients(self):
lcode = ''
for theta in self.variables['theta']:
code = self.code['derivative'][theta.name]
lcode += self._print_code(theta.name) + '.gradient = (partial*(' + self._print_code(code) + ')).sum()\n'
return lcode
def code_gradients_X(self):
lcode = 'gradients_X = np.zeros_like(X)\n'
lcode += 'self.update_cache(X)\n'
for i, theta in enumerate(self.variables['X']):
code = self.code['derivative'][theta.name]
lcode += 'gradients_X[:, ' + str(i) + ':' + str(i) + '+1] = partial*' + self._print_code(code) + '\n'
lcode += 'return gradients_X\n'
return lcode
def code_f(self):
lcode = 'self.update_cache(X)\n'
lcode += 'return ' + self._print_code(self.code['function'])
return lcode
def stabilise(self):
"""Stabilize the code in the model."""
# this code is applied to all expressions in the model in an attempt to sabilize them.
pass
def _gen_namespace(self, modules=None, use_imps=True):
"""Gets the relevant namespaces for the given expressions."""
from sympy.core.symbol import Symbol
# If the user hasn't specified any modules, use what is available.
module_provided = True
if modules is None:
module_provided = False
# Use either numpy (if available) or python.math where possible.
# XXX: This leads to different behaviour on different systems and
# might be the reason for irreproducible errors.
modules = ["math", "mpmath", "sympy"]
try:
_import("numpy")
except ImportError:
pass
else:
modules.insert(1, "numpy")
# Get the needed namespaces.
namespaces = []
# First find any function implementations
if use_imps:
for expr in self.expression_list:
namespaces.append(_imp_namespace(expr))
# Check for dict before iterating
if isinstance(modules, (dict, str)) or not hasattr(modules, '__iter__'):
namespaces.append(modules)
else:
namespaces += list(modules)
# fill namespace with first having highest priority
namespace = {}
for m in namespaces[::-1]:
buf = _get_namespace(m)
namespace.update(buf)
for expr in self.expression_list:
if hasattr(expr, "atoms"):
#Try if you can extract symbols from the expression.
#Move on if expr.atoms in not implemented.
syms = expr.atoms(Symbol)
for term in syms:
namespace.update({str(term): term})
return namespace
def update_expression_list(self):
"""Extract a list of expressions from the dictionary of expressions."""
self.expression_list = [] # code arrives in dictionary, but is passed in this list
self.expression_keys = [] # Keep track of the dictionary keys.
self.expression_order = [] # This may be unnecessary. It's to give ordering for cse
for key in self.expressions.keys():
if key == 'function':
self.expression_list.append(self.expressions[key])
self.expression_keys.append([key])
self.expression_order.append(1)
self.code[key] = ''
elif key[-10:] == 'derivative':
self.code[key] = {}
for dkey in self.expressions[key].keys():
self.expression_list.append(self.expressions[key][dkey])
self.expression_keys.append([key, dkey])
# strip the trailing '_derivative' (11 characters including the underscore)
if key[:-11] == 'first' or key[:-11] == '':
self.expression_order.append(3) #sym.count_ops(self.expressions[key][dkey]))
elif key[:-11] == 'second':
self.expression_order.append(4) #sym.count_ops(self.expressions[key][dkey]))
elif key[:-11] == 'third':
self.expression_order.append(5) #sym.count_ops(self.expressions[key][dkey]))
self.code[key][dkey] = ''
else:
self.expression_list.append(self.expressions[key])
self.expression_keys.append([key])
self.expression_order.append(2)
self.code[key] = ''
# This step may be unnecessary.
# Not 100% sure if the sub expression elimination is order sensitive. This step orders the list with the 'function' code first and derivatives after.
self.expression_order, self.expression_list, self.expression_keys = zip(*sorted(zip(self.expression_order, self.expression_list, self.expression_keys)))
def _gen_code(self, cache_prefix = 'cache', sub_prefix = 'sub', prefix='XoXoXoX'):
"""Generate code for the list of expressions provided using the common sub-expression eliminator to separate out portions that are computed multiple times."""
# This is the dictionary that stores all the generated code.
self.code = {}
# Convert the expressions to a list for common sub expression elimination
# We should find the following type of expressions: 'function', 'derivative', 'second_derivative', 'third_derivative'.
self.update_expression_list()
# Apply any global stabilisation operations to expressions.
self.stabilise()
# Helper functions to get data in and out of dictionaries.
# this code from http://stackoverflow.com/questions/14692690/access-python-nested-dictionary-items-via-a-list-of-keys
def getFromDict(dataDict, mapList):
return reduce(lambda d, k: d[k], mapList, dataDict)
def setInDict(dataDict, mapList, value):
getFromDict(dataDict, mapList[:-1])[mapList[-1]] = value
# Do the common sub expression elimination
subexpressions, expression_substituted_list = sym.cse(self.expression_list, numbered_symbols(prefix=prefix))
cacheable_list = []
# Sort out any expression that's dependent on something that scales with data size (these are listed in cacheable).
self.expressions['params_change'] = []
self.expressions['update_cache'] = []
cache_expressions_list = []
sub_expression_list = []
for expr in subexpressions:
arg_list = [e for e in expr[1].atoms() if e.is_Symbol]
cacheable_symbols = [e for e in arg_list if e in cacheable_list or e in self.cacheable_vars]
if cacheable_symbols:
self.expressions['update_cache'].append((expr[0].name, self._expr2code(arg_list, expr[1])))
# list which ensures dependencies are cacheable.
cacheable_list.append(expr[0])
cache_expressions_list.append(expr[0].name)
else:
self.expressions['params_change'].append((expr[0].name, self._expr2code(arg_list, expr[1])))
sub_expression_list.append(expr[0].name)
# Replace original code with code including subexpressions.
for expr, keys in zip(expression_substituted_list, self.expression_keys):
arg_list = [e for e in expr.atoms() if e.is_Symbol]
setInDict(self.code, keys, self._expr2code(arg_list, expr))
setInDict(self.expressions, keys, expr)
# Create variable names for cache and sub expression portions
cache_dict = {}
self.variables[cache_prefix] = []
for i, sub in enumerate(cache_expressions_list):
name = cache_prefix + str(i)
cache_dict[sub] = name
self.variables[cache_prefix].append(sym.var(name))
sub_dict = {}
self.variables[sub_prefix] = []
for i, sub in enumerate(sub_expression_list):
name = sub_prefix + str(i)
sub_dict[sub] = name
self.variables[sub_prefix].append(sym.var(name))
# Replace sub expressions in main code with either cacheN or subN.
for key, val in cache_dict.iteritems():
for keys in self.expression_keys:
setInDict(self.code, keys,
getFromDict(self.code,keys).replace(key, val))
for key, val in sub_dict.iteritems():
for keys in self.expression_keys:
setInDict(self.code, keys,
getFromDict(self.code,keys).replace(key, val))
# Set up precompute code as either cacheN or subN.
self.code['update_cache'] = {}
for key, val in self.expressions['update_cache']:
expr = val
for key2, val2 in cache_dict.iteritems():
expr = expr.replace(key2, val2)
for key2, val2 in sub_dict.iteritems():
expr = expr.replace(key2, val2)
self.code['update_cache'][cache_dict[key]] = expr
self.expressions['update_cache'] = dict(self.expressions['update_cache'])
self.code['params_change'] = {}
for key, val in self.expressions['params_change']:
expr = val
for key2, val2 in cache_dict.iteritems():
expr = expr.replace(key2, val2)
for key2, val2 in sub_dict.iteritems():
expr = expr.replace(key2, val2)
self.code['params_change'][sub_dict[key]] = expr
self.expressions['params_change'] = dict(self.expressions['params_change'])
def _expr2code(self, arg_list, expr):
"""Convert the given symbolic expression into code."""
code = lambdastr(arg_list, expr)
function_code = code.split(':')[1].strip()
#for arg in arg_list:
# function_code = function_code.replace(arg.name, 'self.'+arg.name)
return function_code
def _print_code(self, code):
"""Prepare code for string writing."""
for key in self.variables.keys():
for arg in self.variables[key]:
code = code.replace(arg.name, 'self.'+arg.name)
return code
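A minimal sketch of the variable-splitting convention implemented in __init__ above: symbols named x_0, x_1, ... belong to the cacheable input 'X', and every remaining free symbol becomes a parameter ('theta').

import sympy as sym
x_0, lengthscale, variance = sym.symbols('x_0 lengthscale variance')
expr = variance*sym.exp(-x_0**2/(2*lengthscale**2))
free_syms = [e for e in expr.atoms() if e.is_Symbol]
X = [e for e in free_syms if e.name.split('_')[0] == 'x']
theta = sorted([e for e in free_syms if e not in X], key=lambda e: e.name)
print X, theta  # [x_0] [lengthscale, variance]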

View file

@@ -192,17 +192,22 @@ class VarDTC(object):
class VarDTCMissingData(object):
const_jitter = 1e-6
def __init__(self, limit=1):
def __init__(self, limit=1, inan=None):
from ...util.caching import Cacher
self._Y = Cacher(self._subarray_computations, limit)
self._inan = inan
pass
def set_limit(self, limit):
self._Y.limit = limit
def _subarray_computations(self, Y):
if self._inan is None:
inan = np.isnan(Y)
has_none = inan.any()
else:
inan = self._inan
has_none = True
if has_none:
from ...util.subarray_and_sorting import common_subarrays
self._subarray_indices = []
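A sketch of the new inan argument (assumed usage): callers that already know the missing-data pattern, such as the hapmap3 dataset added later in this commit which exposes an inan mask, can pass it in and skip the np.isnan scan.

import numpy as np
Y = np.random.randn(100, 5)  # hypothetical data with missing entries
Y[::7, 2] = np.nan
inference = VarDTCMissingData(limit=1, inan=np.isnan(Y))  # precomputed mask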

View file

@@ -3,7 +3,6 @@ from _src.rbf import RBF
from _src.linear import Linear, LinearFull
from _src.static import Bias, White
from _src.brownian import Brownian
from _src.symbolic import Symbolic
from _src.stationary import Exponential, Matern32, Matern52, ExpQuad, RatQuad, Cosine
from _src.mlp import MLP
from _src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52
@@ -12,3 +11,15 @@ from _src.coregionalize import Coregionalize
from _src.ssrbf import SSRBF # TODO: ZD: did you remove this?
from _src.ODE_UY import ODE_UY
# TODO: put this in an init file somewhere
try:
import sympy as sym
sympy_available=True
except ImportError:
sympy_available=False
if sympy_available:
from _src.symbolic import Symbolic
#from _src.heat_eqinit import Heat_eqinit
#from _src.ode1_eq_lfm import Ode1_eq_lfm

View file

@@ -1,15 +1,17 @@
# Check Matthew Rocklin's blog post.
try:
import sympy as sp
import sympy as sym
sympy_available=True
from sympy.utilities.lambdify import lambdify
from GPy.util.symbolic import stabilise
except ImportError:
sympy_available=False
import numpy as np
from kern import Kern
from scipy.special import gammaln, gamma, erf, erfc, erfcx, polygamma
from GPy.util.functions import normcdf, normcdfln, logistic, logisticln, differfln
from ...core.parameterization import Param
from ...core.parameterization.transformations import Logexp
class Symbolic(Kern):
"""
@@ -26,28 +28,42 @@ class Symbolic(Kern):
- to handle multiple inputs, call them x_1, z_1, etc
- to handle multiple correlated outputs, you'll need to add parameters with an index, such as lengthscale_i and lengthscale_j.
"""
def __init__(self, input_dim, k=None, output_dim=1, name='symbolic', param=None, active_dims=None, operators=None):
def __init__(self, input_dim, k=None, output_dim=1, name='symbolic', param=None, active_dims=None, operators=None, func_modules=[]):
if k is None:
raise ValueError, "You must provide an argument for the covariance function."
super(Sympykern, self).__init__(input_dim, active_dims, name)
self._sp_k = k
self.func_modules = list(func_modules)  # copy: avoid mutating the shared default list
self.func_modules += [{'gamma':gamma,
'gammaln':gammaln,
'erf':erf, 'erfc':erfc,
'erfcx':erfcx,
'polygamma':polygamma,
'differfln':differfln,
'normcdf':normcdf,
'normcdfln':normcdfln,
'logistic':logistic,
'logisticln':logisticln},
'numpy']
super(Symbolic, self).__init__(input_dim, active_dims, name)
self._sym_k = k
# pull the variable names out of the symbolic covariance function.
sp_vars = [e for e in k.atoms() if e.is_Symbol]
self._sp_x= sorted([e for e in sp_vars if e.name[0:2]=='x_'],key=lambda x:int(x.name[2:]))
self._sp_z= sorted([e for e in sp_vars if e.name[0:2]=='z_'],key=lambda z:int(z.name[2:]))
sym_vars = [e for e in k.atoms() if e.is_Symbol]
self._sym_x= sorted([e for e in sym_vars if e.name[0:2]=='x_'],key=lambda x:int(x.name[2:]))
self._sym_z= sorted([e for e in sym_vars if e.name[0:2]=='z_'],key=lambda z:int(z.name[2:]))
# Check that variable names make sense.
assert all([x.name=='x_%i'%i for i,x in enumerate(self._sp_x)])
assert all([z.name=='z_%i'%i for i,z in enumerate(self._sp_z)])
assert len(self._sp_x)==len(self._sp_z)
x_dim=len(self._sp_x)
assert all([x.name=='x_%i'%i for i,x in enumerate(self._sym_x)])
assert all([z.name=='z_%i'%i for i,z in enumerate(self._sym_z)])
assert len(self._sym_x)==len(self._sym_z)
x_dim=len(self._sym_x)
self._sp_kdiag = k
for x, z in zip(self._sp_x, self._sp_z):
self._sp_kdiag = self._sp_kdiag.subs(z, x)
self._sym_kdiag = k
for x, z in zip(self._sym_x, self._sym_z):
self._sym_kdiag = self._sym_kdiag.subs(z, x)
# If it is a multi-output covariance, add an input for indexing the outputs.
self._real_input_dim = x_dim
@@ -56,22 +72,22 @@ class Symbolic(Kern):
self.output_dim = output_dim
# extract parameter names from the covariance
thetas = sorted([e for e in sp_vars if not (e.name[0:2]=='x_' or e.name[0:2]=='z_')],key=lambda e:e.name)
thetas = sorted([e for e in sym_vars if not (e.name[0:2]=='x_' or e.name[0:2]=='z_')],key=lambda e:e.name)
# Look for parameters with index (subscripts), they are associated with different outputs.
if self.output_dim>1:
self._sp_theta_i = sorted([e for e in thetas if (e.name[-2:]=='_i')], key=lambda e:e.name)
self._sp_theta_j = sorted([e for e in thetas if (e.name[-2:]=='_j')], key=lambda e:e.name)
self._sym_theta_i = sorted([e for e in thetas if (e.name[-2:]=='_i')], key=lambda e:e.name)
self._sym_theta_j = sorted([e for e in thetas if (e.name[-2:]=='_j')], key=lambda e:e.name)
# Make sure parameter appears with both indices!
assert len(self._sp_theta_i)==len(self._sp_theta_j)
assert all([theta_i.name[:-2]==theta_j.name[:-2] for theta_i, theta_j in zip(self._sp_theta_i, self._sp_theta_j)])
assert len(self._sym_theta_i)==len(self._sym_theta_j)
assert all([theta_i.name[:-2]==theta_j.name[:-2] for theta_i, theta_j in zip(self._sym_theta_i, self._sym_theta_j)])
# Extract names of shared parameters (those without a subscript)
self._sp_theta = [theta for theta in thetas if theta not in self._sp_theta_i and theta not in self._sp_theta_j]
self._sym_theta = [theta for theta in thetas if theta not in self._sym_theta_i and theta not in self._sym_theta_j]
self.num_split_params = len(self._sp_theta_i)
self._split_theta_names = ["%s"%theta.name[:-2] for theta in self._sp_theta_i]
self.num_split_params = len(self._sym_theta_i)
self._split_theta_names = ["%s"%theta.name[:-2] for theta in self._sym_theta_i]
# Add split parameters to the model.
for theta in self._split_theta_names:
# TODO: what if user has passed a parameter vector, how should that be stored and interpreted?
@@ -79,18 +95,18 @@ class Symbolic(Kern):
self.add_parameter(getattr(self, theta))
self.num_shared_params = len(self._sp_theta)
for theta_i, theta_j in zip(self._sp_theta_i, self._sp_theta_j):
self._sp_kdiag = self._sp_kdiag.subs(theta_j, theta_i)
self.num_shared_params = len(self._sym_theta)
for theta_i, theta_j in zip(self._sym_theta_i, self._sym_theta_j):
self._sym_kdiag = self._sym_kdiag.subs(theta_j, theta_i)
else:
self.num_split_params = 0
self._split_theta_names = []
self._sp_theta = thetas
self.num_shared_params = len(self._sp_theta)
self._sym_theta = thetas
self.num_shared_params = len(self._sym_theta)
# Add parameters to the model.
for theta in self._sp_theta:
for theta in self._sym_theta:
val = 1.0
# TODO: what if user has passed a parameter vector, how should that be stored and interpreted? This is the old way before params class.
if param is not None:
@@ -100,25 +116,25 @@ class Symbolic(Kern):
self.add_parameters(getattr(self, theta.name))
# Differentiate with respect to parameters.
derivative_arguments = self._sp_x + self._sp_theta
derivative_arguments = self._sym_x + self._sym_theta
if self.output_dim > 1:
derivative_arguments += self._sp_theta_i
derivative_arguments += self._sym_theta_i
self.derivatives = {theta.name : sp.diff(self._sp_k,theta).simplify() for theta in derivative_arguments}
self.diag_derivatives = {theta.name : sp.diff(self._sp_kdiag,theta).simplify() for theta in derivative_arguments}
self.derivatives = {theta.name : stabilise(sym.diff(self._sym_k,theta)) for theta in derivative_arguments}
self.diag_derivatives = {theta.name : stabilise(sym.diff(self._sym_kdiag,theta)) for theta in derivative_arguments}
# This gives the parameters for the arg list.
self.arg_list = self._sp_x + self._sp_z + self._sp_theta
self.diag_arg_list = self._sp_x + self._sp_theta
self.arg_list = self._sym_x + self._sym_z + self._sym_theta
self.diag_arg_list = self._sym_x + self._sym_theta
if self.output_dim > 1:
self.arg_list += self._sp_theta_i + self._sp_theta_j
self.diag_arg_list += self._sp_theta_i
self.arg_list += self._sym_theta_i + self._sym_theta_j
self.diag_arg_list += self._sym_theta_i
# Check if there are additional linear operators on the covariance.
self._sp_operators = operators
self._sym_operators = operators
# TODO: Deal with linear operators
#if self._sp_operators:
# for operator in self._sp_operators:
#if self._sym_operators:
# for operator in self._sym_operators:
# psi_stats aren't yet implemented.
if False:
@@ -128,17 +144,14 @@ class Symbolic(Kern):
self._gen_code()
def __add__(self,other):
return spkern(self._sp_k+other._sp_k)
return spkern(self._sym_k+other._sym_k)
def _gen_code(self):
#fn_theano = theano_function([self.arg_lists], [self._sp_k + self.derivatives], dims={x: 1}, dtypes={x_0: 'float64', z_0: 'float64'})
self._K_function = lambdify(self.arg_list, self._sp_k, 'numpy')
for key in self.derivatives.keys():
setattr(self, '_K_diff_' + key, lambdify(self.arg_list, self.derivatives[key], 'numpy'))
self._Kdiag_function = lambdify(self.diag_arg_list, self._sp_kdiag, 'numpy')
for key in self.derivatives.keys():
setattr(self, '_Kdiag_diff_' + key, lambdify(self.diag_arg_list, self.diag_derivatives[key], 'numpy'))
#fn_theano = theano_function([self.arg_lists], [self._sym_k + self.derivatives], dims={x: 1}, dtypes={x_0: 'float64', z_0: 'float64'})
self._K_function = lambdify(self.arg_list, self._sym_k, self.func_modules)
self._K_derivatives_code = {key: lambdify(self.arg_list, self.derivatives[key], self.func_modules) for key in self.derivatives.keys()}
self._Kdiag_function = lambdify(self.diag_arg_list, self._sym_kdiag, self.func_modules)
self._Kdiag_derivatives_code = {key: lambdify(self.diag_arg_list, self.diag_derivatives[key], self.func_modules) for key in self.diag_derivatives.keys()}
def K(self,X,X2=None):
self._K_computations(X, X2)
@@ -156,9 +169,9 @@ class Symbolic(Kern):
def gradients_X(self, dL_dK, X, X2=None):
#if self._X is None or X.base is not self._X.base or X2 is not None:
self._K_computations(X, X2)
gradients_X = np.zeros((X.shape[0], X.shape[1]))
for i, x in enumerate(self._sp_x):
gf = getattr(self, '_K_diff_' + x.name)
gradients_X = np.zeros_like(X)
for i, x in enumerate(self._sym_x):
gf = self._K_derivatives_code[x.name]
gradients_X[:, i] = (gf(**self._arguments)*dL_dK).sum(1)
if X2 is None:
gradients_X *= 2
@@ -167,25 +180,25 @@ class Symbolic(Kern):
def gradients_X_diag(self, dL_dK, X):
self._K_computations(X)
dX = np.zeros_like(X)
for i, x in enumerate(self._sp_x):
gf = getattr(self, '_Kdiag_diff_' + x.name)
for i, x in enumerate(self._sym_x):
gf = self._Kdiag_derivatives_code[x.name]
dX[:, i] = gf(**self._diag_arguments)*dL_dK
return dX
def update_gradients_full(self, dL_dK, X, X2=None):
# Need to extract parameters to local variables first
self._K_computations(X, X2)
for theta in self._sp_theta:
for theta in self._sym_theta:
parameter = getattr(self, theta.name)
gf = getattr(self, '_K_diff_' + theta.name)
gf = self._K_derivatives_code[theta.name]
gradient = (gf(**self._arguments)*dL_dK).sum()
if X2 is not None:
gradient += (gf(**self._reverse_arguments)*dL_dK).sum()
setattr(parameter, 'gradient', gradient)
if self.output_dim>1:
for theta in self._sp_theta_i:
for theta in self._sym_theta_i:
parameter = getattr(self, theta.name[:-2])
gf = getattr(self, '_K_diff_' + theta.name)
gf = self._K_derivatives_code[theta.name]
A = gf(**self._arguments)*dL_dK
gradient = np.asarray([A[np.where(self._output_ind==i)].T.sum()
for i in np.arange(self.output_dim)])
@@ -200,14 +213,14 @@ class Symbolic(Kern):
def update_gradients_diag(self, dL_dKdiag, X):
self._K_computations(X)
for theta in self._sp_theta:
for theta in self._sym_theta:
parameter = getattr(self, theta.name)
gf = getattr(self, '_Kdiag_diff_' + theta.name)
gf = self._Kdiag_derivatives_code[theta.name]
setattr(parameter, 'gradient', (gf(**self._diag_arguments)*dL_dKdiag).sum())
if self.output_dim>1:
for theta in self._sp_theta_i:
for theta in self._sym_theta_i:
parameter = getattr(self, theta.name[:-2])
gf = getattr(self, '_Kdiag_diff_' + theta.name)
gf = self._Kdiag_derivatives_code[theta.name]
a = gf(**self._diag_arguments)*dL_dKdiag
setattr(parameter, 'gradient',
np.asarray([a[np.where(self._output_ind==i)].sum()
@@ -220,40 +233,40 @@ class Symbolic(Kern):
# parameter updates here.
self._arguments = {}
self._diag_arguments = {}
for i, x in enumerate(self._sp_x):
for i, x in enumerate(self._sym_x):
self._arguments[x.name] = X[:, i][:, None]
self._diag_arguments[x.name] = X[:, i][:, None]
if self.output_dim > 1:
self._output_ind = np.asarray(X[:, -1], dtype='int')
for i, theta in enumerate(self._sp_theta_i):
for i, theta in enumerate(self._sym_theta_i):
self._arguments[theta.name] = np.asarray(getattr(self, theta.name[:-2])[self._output_ind])[:, None]
self._diag_arguments[theta.name] = self._arguments[theta.name]
for theta in self._sp_theta:
for theta in self._sym_theta:
self._arguments[theta.name] = np.asarray(getattr(self, theta.name))
self._diag_arguments[theta.name] = self._arguments[theta.name]
if X2 is not None:
for i, z in enumerate(self._sp_z):
for i, z in enumerate(self._sym_z):
self._arguments[z.name] = X2[:, i][None, :]
if self.output_dim > 1:
self._output_ind2 = np.asarray(X2[:, -1], dtype='int')
for i, theta in enumerate(self._sp_theta_j):
for i, theta in enumerate(self._sym_theta_j):
self._arguments[theta.name] = np.asarray(getattr(self, theta.name[:-2])[self._output_ind2])[None, :]
else:
for z in self._sp_z:
for z in self._sym_z:
self._arguments[z.name] = self._arguments['x_'+z.name[2:]].T
if self.output_dim > 1:
self._output_ind2 = self._output_ind
for theta in self._sp_theta_j:
for theta in self._sym_theta_j:
self._arguments[theta.name] = self._arguments[theta.name[:-2] + '_i'].T
if X2 is not None:
# These arguments are needed in gradients when X2 is not equal to X.
self._reverse_arguments = self._arguments
for x, z in zip(self._sp_x, self._sp_z):
for x, z in zip(self._sym_x, self._sym_z):
self._reverse_arguments[x.name] = self._arguments[z.name].T
self._reverse_arguments[z.name] = self._arguments[x.name].T
if self.output_dim > 1:
for theta_i, theta_j in zip(self._sp_theta_i, self._sp_theta_j):
for theta_i, theta_j in zip(self._sym_theta_i, self._sym_theta_j):
self._reverse_arguments[theta_i.name] = self._arguments[theta_j.name].T
self._reverse_arguments[theta_j.name] = self._arguments[theta_i.name].T
@@ -265,7 +278,7 @@ if False:
def __init__(self, subkerns, operations, name='sympy_combine'):
super(Symcombine, self).__init__(subkerns, name)
for subkern, operation in zip(subkerns, operations):
self._sp_k += self._k_double_operate(subkern._sp_k, operation)
self._sym_k += self._k_double_operate(subkern._sym_k, operation)
#def _double_operate(self, k, operation):
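A hedged end-to-end sketch of the conventions the docstring above describes: inputs named x_0/z_0, and every other free symbol promoted to a Param. The RBF form here is only an illustration of the API, not part of the commit.

import numpy as np
import sympy as sym
import GPy

x_0, z_0 = sym.symbols('x_0 z_0')
lengthscale, variance = sym.symbols('lengthscale variance', positive=True)
k = variance*sym.exp(-(x_0 - z_0)**2/(2*lengthscale**2))
kern = GPy.kern.Symbolic(input_dim=1, k=k)
X = np.random.randn(10, 1)
print kern.K(X).shape  # (10, 10); lengthscale and variance are now Params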

View file

@@ -6,7 +6,7 @@ try:
import sympy as sym
sympy_available=True
from sympy.utilities.lambdify import lambdify
from GPy.util.symbolic import gammaln, ln_cum_gaussian, cum_gaussian
from GPy.util.symbolic import gammaln, logisticln
except ImportError:
sympy_available=False
@@ -33,12 +33,14 @@ if sympy_available:
"""
def __init__(self, gp_link=None):
if gp_link is None:
gp_link = link_functions.Log()
gp_link = link_functions.Identity()
dispersion = sym.Symbol('dispersion', positive=True, real=True)
y = sym.Symbol('y', nonnegative=True, integer=True)
f = sym.Symbol('f', positive=True, real=True)
gp_link = link_functions.Log()
log_pdf=dispersion*sym.log(dispersion) - (dispersion+y)*sym.log(dispersion+f) + gammaln(y+dispersion) - gammaln(y+1) - gammaln(dispersion) + y*sym.log(f)
#log_pdf= -(dispersion+y)*logisticln(f-sym.log(dispersion)) + gammaln(y+dispersion) - gammaln(y+1) - gammaln(dispersion) + y*(f-sym.log(dispersion))
super(Negative_binomial, self).__init__(log_pdf=log_pdf, gp_link=gp_link, name='Negative_binomial')
# TODO: Check this.
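Addressing the "Check this" above, a hedged numerical check of the log_pdf against scipy's negative binomial (assumed parameterisation: dispersion r, mean f, so p = r/(r+f)):

import numpy as np
from scipy.special import gammaln
from scipy.stats import nbinom
r, f, y = 2.0, 3.0, 4
log_pdf = r*np.log(r) - (r+y)*np.log(r+f) + gammaln(y+r) - gammaln(y+1) - gammaln(r) + y*np.log(f)
print log_pdf, nbinom.logpmf(y, r, r/(r+f))  # the two should agree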

View file

@@ -11,8 +11,8 @@ except ImportError:
import numpy as np
import link_functions
from scipy import stats, integrate
from scipy.special import gammaln, gamma, erf, polygamma
from GPy.util.functions import cum_gaussian, ln_cum_gaussian
from scipy.special import gammaln, gamma, erf, erfc, erfcx, polygamma
from GPy.util.functions import normcdf, normcdfln, logistic, logisticln
from likelihood import Likelihood
from ..core.parameterization import Param
@@ -33,7 +33,17 @@ if sympy_available:
if log_pdf is None:
raise ValueError, "You must provide an argument for the log pdf."
self.func_modules = func_modules + [{'gamma':gamma, 'gammaln':gammaln, 'erf':erf,'polygamma':polygamma, 'cum_gaussian':cum_gaussian, 'ln_cum_gaussian':ln_cum_gaussian}, 'numpy']
self.func_modules = list(func_modules)  # copy: avoid mutating the shared default list
self.func_modules += [{'gamma':gamma,
'gammaln':gammaln,
'erf':erf, 'erfc':erfc,
'erfcx':erfcx,
'polygamma':polygamma,
'normcdf':normcdf,
'normcdfln':normcdfln,
'logistic':logistic,
'logisticln':logisticln},
'numpy']
super(Symbolic, self).__init__(gp_link, name=name)
self.missing_data = False
@@ -58,7 +68,7 @@ if sympy_available:
sym_vars = [e for e in self._sym_missing_log_pdf.atoms() if e.is_Symbol]
sym_f = [e for e in sym_vars if e.name=='f']
if not sym_f:
raise ValueError('No variable f in missing log pdf.')
raise ValueError('No variable f in missing data log pdf.')
sym_y = [e for e in sym_vars if e.name=='y']
if sym_y:
raise ValueError('Data is present in missing data portion of likelihood.')
@@ -74,15 +84,15 @@ if sympy_available:
derivative_arguments = self._sym_f + self._sym_theta
# Do symbolic work to compute derivatives.
self._log_pdf_derivatives = {theta.name : sym.diff(self._sym_log_pdf,theta).simplify() for theta in derivative_arguments}
self._log_pdf_second_derivatives = {theta.name : sym.diff(self._log_pdf_derivatives['f'],theta).simplify() for theta in derivative_arguments}
self._log_pdf_third_derivatives = {theta.name : sym.diff(self._log_pdf_second_derivatives['f'],theta).simplify() for theta in derivative_arguments}
self._log_pdf_derivatives = {theta.name : stabilise(sym.diff(self._sym_log_pdf,theta)) for theta in derivative_arguments}
self._log_pdf_second_derivatives = {theta.name : stabilise(sym.diff(self._log_pdf_derivatives['f'],theta)) for theta in derivative_arguments}
self._log_pdf_third_derivatives = {theta.name : stabilise(sym.diff(self._log_pdf_second_derivatives['f'],theta)) for theta in derivative_arguments}
if self.missing_data:
# Do symbolic work to compute derivatives.
self._missing_log_pdf_derivatives = {theta.name : sym.diff(self._sym_missing_log_pdf,theta).simplify() for theta in derivative_arguments}
self._missing_log_pdf_second_derivatives = {theta.name : sym.diff(self._missing_log_pdf_derivatives['f'],theta).simplify() for theta in derivative_arguments}
self._missing_log_pdf_third_derivatives = {theta.name : sym.diff(self._missing_log_pdf_second_derivatives['f'],theta).simplify() for theta in derivative_arguments}
self._missing_log_pdf_derivatives = {theta.name : stabilise(sym.diff(self._sym_missing_log_pdf,theta)) for theta in derivative_arguments}
self._missing_log_pdf_second_derivatives = {theta.name : stabilise(sym.diff(self._missing_log_pdf_derivatives['f'],theta)) for theta in derivative_arguments}
self._missing_log_pdf_third_derivatives = {theta.name : stabilise(sym.diff(self._missing_log_pdf_second_derivatives['f'],theta)) for theta in derivative_arguments}
# Add parameters to the model.
@@ -96,7 +106,7 @@ if sympy_available:
self.add_parameters(getattr(self, theta.name))
# Is there some way to check whether the pdf is log
# TODO: Is there an easy way to check whether the pdf is log
# concave? For the moment, need user to specify.
self.log_concave = log_concave
@@ -106,22 +116,30 @@ if sympy_available:
# generate the code for the pdf and derivatives
self._gen_code()
def list_functions(self):
"""Return a list of all symbolic functions in the model and their names."""
def _gen_code(self):
"""Generate the code from the symbolic parts that will be used for likleihod computation."""
# TODO: Check here whether theano is available and set up
# functions accordingly.
self._log_pdf_function = lambdify(self.arg_list, self._sym_log_pdf, self.func_modules)
# compute code for derivatives (for implicit likelihood terms
# we need up to 3rd derivatives)
setattr(self, '_first_derivative_code', {key: lambdify(self.arg_list, self._log_pdf_derivatives[key], self.func_modules) for key in self._log_pdf_derivatives.keys()})
setattr(self, '_second_derivative_code', {key: lambdify(self.arg_list, self._log_pdf_second_derivatives[key], self.func_modules) for key in self._log_pdf_second_derivatives.keys()})
setattr(self, '_third_derivative_code', {key: lambdify(self.arg_list, self._log_pdf_third_derivatives[key], self.func_modules) for key in self._log_pdf_third_derivatives.keys()})
symbolic_functions = [self._sym_log_pdf]
deriv_list = [self._log_pdf_derivatives, self._log_pdf_second_derivatives, self._log_pdf_third_derivatives]
symbolic_functions += [deriv[key] for deriv in deriv_list for key in sorted(deriv.keys())]
if self.missing_data:
setattr(self, '_missing_first_derivative_code', {key: lambdify(self.arg_list, self._missing_log_pdf_derivatives[key], self.func_modules) for key in self._missing_log_pdf_derivatives.keys()})
setattr(self, '_missing_second_derivative_code', {key: lambdify(self.arg_list, self._missing_log_pdf_second_derivatives[key], self.func_modules) for key in self._missing_log_pdf_second_derivatives.keys()})
setattr(self, '_missing_third_derivative_code', {key: lambdify(self.arg_list, self._missing_log_pdf_third_derivatives[key], self.func_modules) for key in self._missing_log_pdf_third_derivatives.keys()})
symbolic_functions+=[self._sym_missing_log_pdf]
deriv_list = [self._missing_log_pdf_derivatives, self._missing_log_pdf_second_derivatives, self._missing_log_pdf_third_derivatives]
symbolic_functions += [deriv[key] for deriv in deriv_list for key in sorted(deriv.keys())]
# self._log_pdf_function = lambdify(self.arg_list, self._sym_log_pdf, self.func_modules)
# # compute code for derivatives
# self._derivative_code = {key: lambdify(self.arg_list, self._log_pdf_derivatives[key], self.func_modules) for key in self._log_pdf_derivatives.keys()}
# self._second_derivative_code = {key: lambdify(self.arg_list, self._log_pdf_second_derivatives[key], self.func_modules) for key in self._log_pdf_second_derivatives.keys()}
# self._third_derivative_code = {key: lambdify(self.arg_list, self._log_pdf_third_derivatives[key], self.func_modules) for key in self._log_pdf_third_derivatives.keys()}
# if self.missing_data:
# self._missing_derivative_code = {key: lambdify(self.arg_list, self._missing_log_pdf_derivatives[key], self.func_modules) for key in self._missing_log_pdf_derivatives.keys()}
# self._missing_second_derivative_code = {key: lambdify(self.arg_list, self._missing_log_pdf_second_derivatives[key], self.func_modules) for key in self._missing_log_pdf_second_derivatives.keys()}
# self._missing_third_derivative_code = {key: lambdify(self.arg_list, self._missing_log_pdf_third_derivatives[key], self.func_modules) for key in self._missing_log_pdf_third_derivatives.keys()}
# TODO: compute EP code parts based on logZ. We need dlogZ/dmu, d2logZ/dmu2 and dlogZ/dtheta
@@ -210,9 +228,9 @@ if sympy_available:
assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
self._arguments_update(inv_link_f, y)
if self.missing_data:
return np.where(np.isnan(y), self._missing_first_derivative_code['f'](**self._missing_arguments), self._first_derivative_code['f'](**self._arguments))
return np.where(np.isnan(y), self._missing_derivative_code['f'](**self._missing_arguments), self._derivative_code['f'](**self._arguments))
else:
return np.where(np.isnan(y), 0., self._first_derivative_code['f'](**self._arguments))
return np.where(np.isnan(y), 0., self._derivative_code['f'](**self._arguments))
def d2logpdf_dlink2(self, inv_link_f, y, Y_metadata=None):
"""
@@ -255,9 +273,9 @@ if sympy_available:
g = np.zeros((np.atleast_1d(y).shape[0], len(self._sym_theta)))
for i, theta in enumerate(self._sym_theta):
if self.missing_data:
g[:, i:i+1] = np.where(np.isnan(y), self._missing_first_derivative_code[theta.name](**self._arguments), self._first_derivative_code[theta.name](**self._arguments))
g[:, i:i+1] = np.where(np.isnan(y), self._missing_derivative_code[theta.name](**self._arguments), self._derivative_code[theta.name](**self._arguments))
else:
g[:, i:i+1] = np.where(np.isnan(y), 0., self._first_derivative_code[theta.name](**self._arguments))
g[:, i:i+1] = np.where(np.isnan(y), 0., self._derivative_code[theta.name](**self._arguments))
return g.sum(0)
def dlogpdf_dlink_dtheta(self, inv_link_f, y, Y_metadata=None):

View file

@@ -1,7 +1,17 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Copyright (c) 2013, 2014 GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
from kernel import Kernel
from linear import Linear
from mlp import MLP
#from rbf import RBF
# TODO need to fix this in a config file.
try:
import sympy as sym
sympy_available=True
except ImportError:
sympy_available=False
if sympy_available:
# These are likelihoods that rely on symbolic.
from symbolic import Symbolic

View file

@@ -1,11 +1,11 @@
# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
# Copyright (c) 2013, 2014 GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import numpy as np
from ..core.mapping import Mapping
from ..core.mapping import Bijective_mapping
from ..core.parameterization import Param
class Linear(Mapping):
class Linear(Bijective_mapping):
"""
Mapping based on a linear model.
@@ -20,8 +20,8 @@ class Linear(Mapping):
"""
def __init__(self, input_dim=1, output_dim=1, name='linear_map'):
Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
def __init__(self, input_dim=1, output_dim=1, name='linear'):
Bijective_mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
self.W = Param('W',np.array((self.input_dim, self.output_dim)))
self.bias = Param('bias',np.array(self.output_dim))
self.add_parameters(self.W, self.bias)
@@ -29,10 +29,15 @@ class Linear(Mapping):
def f(self, X):
return np.dot(X,self.W) + self.bias
def g(self, f):
V = np.linalg.solve(np.dot(self.W.T, self.W), self.W.T)
return np.dot(f-self.bias, V)
def df_dtheta(self, dL_df, X):
df_dW = (dL_df[:, :, None]*X[:, None, :]).sum(0).T
df_dbias = (dL_df.sum(0))
return np.hstack((df_dW.flatten(), df_dbias))
def dL_dX(self, dL_df, X):
return (dL_df[:, None, :]*self.W[None, :, :]).sum(2)
def dL_dX(self, partial, X):
"""The gradient of L with respect to the inputs to the mapping, where L is a function that is dependent on the output of the mapping, f."""
return (partial[:, None, :]*self.W[None, :, :]).sum(2)
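A hedged round-trip check of the inverse mapping g defined above, in plain numpy (square, well-conditioned W assumed, in which case V reduces to inv(W)):

import numpy as np
W = np.eye(2) + 0.1*np.random.randn(2, 2)
bias = np.random.randn(2)
X = np.random.randn(5, 2)
f = np.dot(X, W) + bias
V = np.linalg.solve(np.dot(W.T, W), W.T)  # == inv(W) when W is square
print np.allclose(np.dot(f - bias, V), X)  # expect True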

56
GPy/mappings/symbolic.py Normal file
View file

@@ -0,0 +1,56 @@
# Copyright (c) 2014 GPy Authors
# Licensed under the BSD 3-clause license (see LICENSE.txt)
import sympy as sym
from ..core.mapping import Mapping, Bijective_mapping
from ..core.symbolic import Symbolic_core
import numpy as np
class Symbolic(Mapping, Symbolic_core):
"""
Symbolic mapping
Mapping where the form of the mapping is provided by a sympy expression.
"""
def __init__(self, input_dim, output_dim, f=None, name='symbolic', param=None, func_modules=[]):
if f is None:
raise ValueError, "You must provide an argument for the function."
Mapping.__init__(self, input_dim, output_dim, name=name)
Symbolic_core.__init__(self, f, ['X'], derivatives = ['X', 'theta'], param=param, func_modules=func_modules)
self._initialize_cache()
self.parameters_changed()
def _initialize_cache(self):
self.x_0 = np.random.normal(size=(3, self.input_dim))
def parameters_changed(self):
self.eval_parameters_changed()
def update_cache(self, X):
self.eval_update_cache(X)
def update_gradients(self, partial, X):
self.eval_update_gradients(partial, X)
def gradients_X(self, partial, X):
return self.eval_gradients_X(partial, X)
def f(self, X):
"""
"""
return self.eval_f(X)
def df_dX(self, X):
"""
"""
pass
def df_dtheta(self, X):
pass
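A hedged sketch of driving this mapping (assuming GPy.mappings exports Symbolic when sympy is available, per the __init__ change elsewhere in this commit); the expression is kept to plain arithmetic so the generated code only needs numpy broadcasting:

import numpy as np
import sympy as sym
import GPy

x_0, a = sym.symbols('x_0 a')
mapping = GPy.mappings.Symbolic(input_dim=1, output_dim=1, f=a*x_0**2)
X = np.random.randn(5, 1)
print mapping.f(X).shape  # (5, 1); 'a' has been added as a Param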

View file

@@ -24,12 +24,14 @@ class Test(unittest.TestCase):
self.assertDictEqual(self.param_index._properties, {})
def test_remove(self):
self.param_index.remove(three, np.r_[3:10])
removed = self.param_index.remove(three, np.r_[3:10])
self.assertListEqual(removed.tolist(), [4, 7])
self.assertListEqual(self.param_index[three].tolist(), [2])
self.param_index.remove(one, [1])
removed = self.param_index.remove(one, [1])
self.assertListEqual(removed.tolist(), [])
self.assertListEqual(self.param_index[one].tolist(), [3])
self.assertListEqual(self.param_index.remove('not in there', []).tolist(), [])
self.param_index.remove(one, [9])
removed = self.param_index.remove(one, [9])
self.assertListEqual(self.param_index[one].tolist(), [3])
self.assertListEqual(self.param_index.remove('not in there', [2,3,4]).tolist(), [])
@@ -78,6 +80,13 @@ class Test(unittest.TestCase):
self.assertEqual(i, i2)
self.assertTrue(np.all(v == v2))
def test_indexview_remove(self):
removed = self.view.remove(two, [3])
self.assertListEqual(removed.tolist(), [3])
removed = self.view.remove(three, np.r_[:5])
self.assertListEqual(removed.tolist(), [0, 2])
def test_misc(self):
for k,v in self.param_index.copy()._properties.iteritems():
self.assertListEqual(self.param_index[k].tolist(), v.tolist())

View file

@@ -9,8 +9,6 @@ from GPy.core.parameterization.param import Param
verbose = 0
np.random.seed(50)
class Kern_check_model(GPy.core.Model):
"""

View file

@@ -153,6 +153,18 @@ class ParameterizedTest(unittest.TestCase):
self.testmodel.randomize()
np.testing.assert_equal(variances, self.testmodel['.*var'].values())
def test_fix_unfix(self):
fixed = self.testmodel.kern.lengthscale.fix()
self.assertListEqual(fixed.tolist(), [0])
unfixed = self.testmodel.kern.lengthscale.unfix()
self.testmodel.kern.lengthscale.constrain_positive()
self.assertListEqual(unfixed.tolist(), [0])
fixed = self.testmodel.kern.fix()
self.assertListEqual(fixed.tolist(), [0,1])
unfixed = self.testmodel.kern.unfix()
self.assertListEqual(unfixed.tolist(), [0,1])
def test_printing(self):
print self.test1
print self.param

File diff suppressed because one or more lines are too long

View file

@@ -106,9 +106,30 @@ def download_url(url, store_directory, save_name = None, messages = True, suffix
raise ValueError('Tried url ' + url + suffix + ' and received client error ' + str(response.code))
elif response.code > 499:
raise ValueError('Tried url ' + url + suffix + ' and received server error ' + str(response.code))
# if we wanted to get more sophisticated maybe we should check the response code here again even for successes.
with open(save_name, 'wb') as f:
f.write(response.read())
meta = response.info()
file_size = int(meta.getheaders("Content-Length")[0])
status = ""
file_size_dl = 0
block_sz = 8192
line_length=30
while True:
buff = response.read(block_sz)
if not buff:
break
file_size_dl += len(buff)
f.write(buff)
sys.stdout.write(" "*(len(status)) + "\r")
status = r"[{perc: <{ll}}] {dl:7.3f}/{full:.3f}MB".format(dl=file_size_dl/(1.*1e6),
full=file_size/(1.*1e6), ll=line_length,
perc="="*int(line_length*float(file_size_dl)/file_size))
sys.stdout.write(status)
sys.stdout.flush()
sys.stdout.write(" "*(len(status)) + "\r")
print status
# if we wanted to get more sophisticated maybe we should check the response code here again even for successes.
#with open(save_name, 'wb') as f:
# f.write(response.read())
#urllib.urlretrieve(url+suffix, save_name, reporthook)
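For reference, the status line the download loop above renders, reproduced standalone (values hypothetical):

line_length = 30
status = r"[{perc: <{ll}}] {dl:7.3f}/{full:.3f}MB".format(dl=3.2, full=8.0, ll=line_length, perc="="*int(line_length*3.2/8.0))
print status  # [============                  ]   3.200/8.000MB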
@@ -304,8 +325,8 @@ def football_data(season='1314', data_set='football_data'):
data_set_season = data_set + '_' + season
data_resources[data_set_season] = copy.deepcopy(data_resources[data_set])
data_resources[data_set_season]['urls'][0]+=season + '/'
start_year = int(year[0:2])
end_year = int(year[2:4])
start_year = int(season[0:2])
end_year = int(season[2:4])
files = ['E0.csv', 'E1.csv', 'E2.csv', 'E3.csv']
if start_year>4 and start_year < 93:
files += ['EC.csv']
@@ -552,6 +573,151 @@ def swiss_roll_generated(num_samples=1000, sigma=0.0):
c = c[so, :]
return {'Y':Y, 't':t, 'colors':c}
def hapmap3(data_set='hapmap3'):
"""
The HapMap phase three SNP dataset - 1184 samples out of 11 populations.
SNP_matrix (A) encoding [see Paschou et al. 2007 (PCA-Correlated SNPs...)]:
Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then
/ 1, iff SNPij==(B1,B1)
Aij = | 0, iff SNPij==(B1,B2)
\ -1, iff SNPij==(B2,B2)
The SNP data and the meta information (such as iid, sex and phenotype) are
stored in the dataframe datadf, index is the Individual ID,
with following columns for metainfo:
* family_id -> Family ID
* paternal_id -> Paternal ID
* maternal_id -> Maternal ID
* sex -> Sex (1=male; 2=female; other=unknown)
* phenotype -> Phenotype (-9, or 0 for unknown)
* population -> Population string (e.g. 'ASW' - 'YRI')
* rest are SNP rs (ids)
More information is given in infodf:
* Chromosome:
- autosomal chromosomes -> 1-22
- X X chromosome -> 23
- Y Y chromosome -> 24
- XY Pseudo-autosomal region of X -> 25
- MT Mitochondrial -> 26
* Relative Position (to Chromosome) [base pairs]
"""
try:
from pandas import read_pickle, DataFrame
from sys import stdout
import bz2
except ImportError as i:
raise ImportError("Need pandas for hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset")
if not data_available(data_set):
download_data(data_set)
dirpath = os.path.join(data_path,'hapmap3')
hapmap_file_name = 'hapmap3_r2_b36_fwd.consensus.qc.poly'
preprocessed_data_paths = [os.path.join(dirpath,hapmap_file_name + file_name) for file_name in \
['.snps.pickle',
'.info.pickle',
'.nan.pickle']]
if not reduce(lambda a,b: a and b, map(os.path.exists, preprocessed_data_paths)):
if not overide_manual_authorize and not prompt_user("Preprocessing requires ~25GB "
"of memory and can take a (very) long time, continue? [Y/n]"):
print "Preprocessing required for further usage."
return
status = "Preprocessing data, please be patient..."
print status
def write_status(message, progress, status):
stdout.write(" "*len(status)); stdout.write("\r"); stdout.flush()
status = r"[{perc: <{ll}}] {message: <13s}".format(message=message, ll=20,
perc="="*int(20.*progress/100.))
stdout.write(status); stdout.flush()
return status
unpacked_files = [os.path.join(dirpath, hapmap_file_name+ending) for ending in ['.ped', '.map']]
if not reduce(lambda a,b: a and b, map(os.path.exists, unpacked_files)):
status=write_status('unpacking...', 0, '')
curr = 0
for newfilepath in unpacked_files:
if not os.path.exists(newfilepath):
filepath = newfilepath + '.bz2'
file_size = os.path.getsize(filepath)
with open(newfilepath, 'wb') as new_file, open(filepath, 'rb') as f:
decomp = bz2.BZ2Decompressor()
file_processed = 0
buffsize = 100 * 1024
for data in iter(lambda : f.read(buffsize), b''):
new_file.write(decomp.decompress(data))
file_processed += len(data)
status=write_status('unpacking...', curr+12.*file_processed/(file_size), status)
curr += 12
status=write_status('unpacking...', curr, status)
status=write_status('reading .ped...', 25, status)
# Preprocess data:
snpstrnp = np.loadtxt(unpacked_files[0], dtype=str)
status=write_status('reading .map...', 33, status)
mapnp = np.loadtxt(unpacked_files[1], dtype=str)
status=write_status('reading relationships.txt...', 42, status)
# and metainfo:
infodf = DataFrame.from_csv(os.path.join(dirpath,'./relationships_w_pops_121708.txt'), header=0, sep='\t')
infodf.set_index('IID', inplace=1)
status=write_status('filtering nan...', 45, status)
snpstr = snpstrnp[:,6:].astype('S1').reshape(snpstrnp.shape[0], -1, 2)
inan = snpstr[:,:,0] == '0'
status=write_status('filtering reference alleles...', 55, status)
ref = np.array(map(lambda x: np.unique(x)[-2:], snpstr.swapaxes(0,1)[:,:,:]))
status=write_status('encoding snps...', 70, status)
# Encode the information for each gene in {-1,0,1}:
status=write_status('encoding snps...', 73, status)
snps = (snpstr==ref[None,:,:])
status=write_status('encoding snps...', 76, status)
snps = (snps*np.array([1,-1])[None,None,:])
status=write_status('encoding snps...', 78, status)
snps = snps.sum(-1)
status=write_status('encoding snps...', 81, status)
snps = snps.astype('i8')
status=write_status('marking nan values...', 88, status)
# put in nan values (masked as -128):
snps[inan] = -128
status=write_status('setting up meta...', 94, status)
# get meta information:
metaheader = np.r_[['family_id', 'iid', 'paternal_id', 'maternal_id', 'sex', 'phenotype']]
metadf = DataFrame(columns=metaheader, data=snpstrnp[:,:6])
metadf.set_index('iid', inplace=1)
metadf = metadf.join(infodf.population)
metadf.to_pickle(preprocessed_data_paths[1])
# put everything together:
status=write_status('setting up snps...', 96, status)
snpsdf = DataFrame(index=metadf.index, data=snps, columns=mapnp[:,1])
with open(preprocessed_data_paths[0], 'wb') as f:
pickle.dump(snpsdf, f, protocol=-1)
status=write_status('setting up snps...', 98, status)
inandf = DataFrame(index=metadf.index, data=inan, columns=mapnp[:,1])
inandf.to_pickle(preprocessed_data_paths[2])
status=write_status('done :)', 100, status)
print ''
else:
print "loading snps..."
snpsdf = read_pickle(preprocessed_data_paths[0])
print "loading metainfo..."
metadf = read_pickle(preprocessed_data_paths[1])
print "loading nan entries..."
inandf = read_pickle(preprocessed_data_paths[2])
snps = snpsdf.values
populations = metadf.population.values.astype('S3')
hapmap = dict(name=data_set,
description='The HapMap phase three SNP dataset - '
'1184 samples out of 11 populations. inan is a '
'boolean array, containing whether or not the '
'given entry is nan (nans are masked as '
'-128 in snps).',
snpsdf=snpsdf,
metadf=metadf,
snps=snps,
inan=inandf.values,
inandf=inandf,
populations=populations)
return hapmap
def swiss_roll_1000():
return swiss_roll(num_samples=1000)
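A toy illustration of the {-1, 0, 1} SNP encoding documented in the hapmap3 docstring above (hypothetical bases, mirroring the ref-comparison lines of the preprocessing):

import numpy as np
snp = np.array([['A', 'A'], ['A', 'G'], ['G', 'G']])  # three individuals, one SNP
B1, B2 = np.unique(snp)[-2:]                          # alphabetically sorted bases
code = (snp == np.array([B1, B2])[None, :])*np.array([1, -1])[None, :]
print code.sum(-1)  # -> [ 1  0 -1], i.e. (B1,B1)->1, (B1,B2)->0, (B2,B2)->-1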

View file

@@ -25,8 +25,8 @@ data_resources = {'ankur_pose_data' : {'urls' : [neil_url + 'ankur_pose_data/'],
'size' : 1100584},
'cmu_mocap_full' : {'urls' : ['http://mocap.cs.cmu.edu'],
'files' : [['allasfamc.zip']],
'citation' : """Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.
The database was created with funding from NSF EIA-0196217.""",
'citation' : """Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.'
'The database was created with funding from NSF EIA-0196217.""",
'details' : """CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.""",
'license' : """From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.""",
'size' : None},
@@ -120,8 +120,49 @@ The database was created with funding from NSF EIA-0196217.""",
'details' : """Accelerometer pen data used for robust regression by Tipping and Lawrence.""",
'citation' : 'Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005',
'license' : None,
'size' : 3410}
'size' : 3410},
'hapmap3' : {'urls' : ['http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/latest_phaseIII_ncbi_b36/plink_format/'],
'files' : [['hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2', 'hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2', 'relationships_w_pops_121708.txt']],
'details' : """
HapMap Project: Single Nucleotide Polymorphism sequenced in all human populations.
The HapMap phase three SNP dataset - 1184 samples out of 11 populations.
See http://www.nature.com/nature/journal/v426/n6968/abs/nature02168.html for details.
SNP_matrix (A) encoding [see Paschou et al. 2007 (PCA-Correlated SNPs...)]:
Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then
/ 1, iff SNPij==(B1,B1)
Aij = | 0, iff SNPij==(B1,B2)
\ -1, iff SNPij==(B2,B2)
The SNP data and the meta information (such as iid, sex and phenotype) are
stored in the dataframe datadf; the index is the Individual ID,
with the following columns for metainfo:
* family_id -> Family ID
* paternal_id -> Paternal ID
* maternal_id -> Maternal ID
* sex -> Sex (1=male; 2=female; other=unknown)
* phenotype -> Phenotype (-9, or 0 for unknown)
* population -> Population string (e.g. 'ASW', 'YRI')
* the remaining columns are the SNP rs ids
More information is given in infodf:
* Chromosome:
- autosomal chromosomes -> 1-22
- X X chromosome -> 23
- Y Y chromosome -> 24
- XY Pseudo-autosomal region of X -> 25
- MT Mitochondrial -> 26
* Relative Position (on the chromosome) [base pairs]
""",
'citation': """Gibbs, Richard A., et al. "The international HapMap project." Nature 426.6968 (2003): 789-796.""",
'license' : """International HapMap Project Public Access License (http://hapmap.ncbi.nlm.nih.gov/cgi-perl/registration#licence)""",
'size' : 2*1729092237 + 62265},
}
with open('data_resources.json', 'w') as file:
json.dump(data_resources, file)
with open('data_resources.json', 'w') as f:
print "writing data_resources"
json.dump(data_resources, f)
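# A quick round-trip sketch: the registry written above can be read
# straight back with json.load (the filename and the 'hapmap3' key are
# taken from the code above):
import json
with open('data_resources.json', 'r') as f:
    data_resources = json.load(f)
print data_resources['hapmap3']['urls']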

View file

@@ -1,18 +1,25 @@
import numpy as np
from scipy.special import erf, erfcx
from scipy.special import erf, erfc, erfcx
import sys
epsilon = sys.float_info.epsilon
lim_val = -np.log(epsilon)
def cum_gaussian(x):
g=0.5*(1+erf(x/np.sqrt(2)))
def logisticln(x):
return np.where(x<lim_val, np.where(x>-lim_val, -np.log(1+np.exp(-x)), -x), -np.log(1+epsilon))
def logistic(x):
return np.where(x<lim_val, np.where(x>-lim_val, 1/(1+np.exp(-x)), epsilon/(epsilon+1)), 1/(1+epsilon))
def normcdf(x):
g=0.5*erfc(-x/np.sqrt(2))
return np.where(g==0, epsilon, np.where(g==1, 1-epsilon, g))
def ln_cum_gaussian(x):
return np.where(x < 0, -.5*x*x + np.log(.5) + np.log(erfcx(-np.sqrt(2)/2*x)), np.log(cum_gaussian(x)))
def normcdfln(x):
return np.where(x < 0, -.5*x*x + np.log(.5) + np.log(erfcx(-x/np.sqrt(2))), np.log(normcdf(x)))
def clip_exp(x):
if any(x>=lim_val) or any(x<=-lim_val):
return np.where(x<lim_val, np.where(x>-lim_val, np.exp(x), np.exp(-lim_val)), np.exp(lim_val))
else:
return np.exp(x)
return np.where(x<lim_val, np.where(x>-lim_val, np.exp(x), epsilon), 1/epsilon)
def differfln(x0, x1):
# this is, hopefully, a numerically more stable variant of log(erf(x0)-erf(x1)) = log(erfc(x1)-erfc(x0)).
return np.where(x0>x1, -x1*x1 + np.log(erfcx(x1)-np.exp(-x0**2+x1**2)*erfcx(x0)), -x0*x0 + np.log(np.exp(-x1**2+x0**2)*erfcx(x1) - erfcx(x0)))

View file

@@ -1,6 +1,54 @@
from sympy import Function, S, oo, I, cos, sin, asin, log, erf, pi, exp, sqrt, sign, gamma,polygamma
import sys
import numpy as np
import sympy as sym
from sympy import Function, S, oo, I, cos, sin, asin, log, erf, pi, exp, sqrt, sign, gamma, polygamma
class logistic(Function):
"""The logistic function as a symbolic function."""
nargs = 1
def fdiff(self, argindex=1):
x = self.args[0]
return logistic(x)*(1-logistic(x))
@classmethod
def eval(cls, x):
if x.is_Number:
return 1/(1+exp(-x))
class logisticln(Function):
"""The log logistic, which can often be computed with more precision than the simply taking log(logistic(x)) when x is small or large."""
nargs = 1
def fdiff(self, argindex=1):
x = self.args[0]
return 1-logistic(x)
@classmethod
def eval(cls, x):
if x.is_Number:
return -log(1+exp(-x))
class erfc(Function):
"""The complementary error function, erfc(x) = 1-erf(x). Used as a helper function, particularly for erfcx, the scaled complementary error function. and the normal distributions cdf."""
nargs = 1
@classmethod
def eval(cls, arg):
return 1-erf(arg)
class erfcx(Function):
nargs = 1
def fdiff(self, argindex=1):
x = self.args[0]
return 2*x*erfcx(x)-2/sqrt(pi)  # d/dx[exp(x**2)*erfc(x)]
@classmethod
def eval(cls, x):
if x.is_Number:
return exp(x**2)*erfc(x)
class gammaln(Function):
"""The log of the gamma function, which is often needed instead of log(gamma(x)) for better accuracy for large x."""
nargs = 1
def fdiff(self, argindex=1):
@@ -13,22 +61,26 @@ class gammaln(Function):
return log(gamma(x))
class ln_cum_gaussian(Function):
class normcdfln(Function):
"""The log of the normal cdf. Can often be computed with better accuracy than log(normcdf(x)), particulary when x is either small or large."""
nargs = 1
def fdiff(self, argindex=1):
x = self.args[0]
return exp(-ln_cum_gaussian(x) - 0.5*x*x)/sqrt(2*pi)
#return -erfcx(-x/sqrt(2))/sqrt(2*pi)
#return exp(-normcdfln(x) - 0.5*x*x)/sqrt(2*pi)
return sqrt(2/pi)*1/erfcx(-x/sqrt(2))
@classmethod
def eval(cls, x):
if x.is_Number:
return log(cum_gaussian(x))
return log(normcdf(x))
def _eval_is_real(self):
return self.args[0].is_real
class cum_gaussian(Function):
class normcdf(Function):
"""The cumulative distribution function of the standard normal. Provided as a convenient helper function. It is computed throught -0.5*erfc(-x/sqrt(2))."""
nargs = 1
def fdiff(self, argindex=1):
x = self.args[0]
@@ -37,12 +89,30 @@ class cum_gaussian(Function):
@classmethod
def eval(cls, x):
if x.is_Number:
return 0.5*(1+erf(sqrt(2)/2*x))
return 0.5*(erfc(-x/sqrt(2)))
def _eval_is_real(self):
return self.args[0].is_real
class gaussian(Function):
class normalln(Function):
"""The log of the standard normal distribution."""
nargs = 1
def fdiff(self, argindex=1):
x = self.args[0]
return -x
@classmethod
def eval(cls, x):
if x.is_Number:
return -0.5*log(2*pi) - 0.5*x*x
def _eval_is_real(self):
return True
class normal(Function):
"""The standard normal distribution. Provided as a convenience function."""
nargs = 1
@classmethod
def eval(cls, x):
@@ -51,23 +121,23 @@ class gaussian(Function):
def _eval_is_real(self):
return True
class ln_diff_erf(Function):
class differfln(Function):
nargs = 2
def fdiff(self, argindex=2):
if argindex == 2:
x0, x1 = self.args
return -2*exp(-x1**2)/(sqrt(pi)*(erf(x0)-erf(x1)))
return -2/(sqrt(pi)*(erfcx(x1)-exp(x1**2-x0**2)*erfcx(x0)))
elif argindex == 1:
x0, x1 = self.args
return 2.*exp(-x0**2)/(sqrt(pi)*(erf(x0)-erf(x1)))
return 2/(sqrt(pi)*(exp(x0**2-x1**2)*erfcx(x1)-erfcx(x0)))
else:
raise ArgumentIndexError(self, argindex)
@classmethod
def eval(cls, x0, x1):
if x0.is_Number and x1.is_Number:
return log(erf(x0)-erf(x1))
return log(erfc(x1)-erfc(x0))
class dh_dd_i(Function):
nargs = 5
@@ -232,7 +302,6 @@ class h(Function):
@classmethod
def eval(cls, t, tprime, d_i, d_j, l):
# Checking is_Number here forces sympy to look for an fdiff method when asked for a derivative. If it is left out, self.diff differentiates the symbolic terms from eval directly, which we want to avoid because we are looking to ensure everything is numerically stable.
if (t.is_Number
and tprime.is_Number
and d_i.is_Number
@@ -273,17 +342,5 @@ class h(Function):
# *(erf(tprime/l - d_j/2.*l)
# + erf(d_j/2.*l))))
class erfc(Function):
nargs = 1
@classmethod
def eval(cls, arg):
return 1-erf(arg)
class erfcx(Function):
nargs = 1
@classmethod
def eval(cls, arg):
return erfc(arg)*exp(arg*arg)
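# Finally, a short sketch of how sympy picks up the fdiff/eval hooks
# defined above. The import path 'GPy.util.symbolic' is an assumption;
# adjust it to wherever this module actually lives:
import sympy as sym
from GPy.util.symbolic import logistic, logisticln, erfcx  # hypothetical path
x = sym.Symbol('x', real=True)
print sym.diff(logisticln(x), x)   # 1 - logistic(x), via logisticln.fdiff
print sym.diff(erfcx(x), x)        # 2*x*erfcx(x) - 2/sqrt(pi), via erfcx.fdiff
print logistic(sym.Integer(0))     # 1/2, via logistic.eval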