From 9b5a1edb23061b16c79bcb9961f120a210053ca9 Mon Sep 17 00:00:00 2001
From: Neil Lawrence
Date: Mon, 7 Apr 2014 10:31:13 +0200
Subject: [PATCH 1/7] Partial changes to symbolic, including adding mapping covariance and beginning to unify code generation.

---
 GPy/core/mapping.py                  |   8 ++
 GPy/kern/__init__.py                 |  12 ++-
 GPy/kern/_src/symbolic.py            | 154 +++++++++++++++------------
 GPy/likelihoods/negative_binomial.py |   6 +-
 GPy/likelihoods/symbolic.py          |  56 ++++++----
 GPy/mappings/__init__.py             |  12 ++-
 GPy/mappings/linear.py               |  19 ++--
 GPy/util/functions.py                |  21 ++--
 GPy/util/symbolic.py                 |  96 +++++++++++++----
 9 files changed, 252 insertions(+), 132 deletions(-)

diff --git a/GPy/core/mapping.py b/GPy/core/mapping.py
index 6eaaf96c..8cfd9ad0 100644
--- a/GPy/core/mapping.py
+++ b/GPy/core/mapping.py
@@ -65,6 +65,14 @@ class Mapping(Parameterized):
         else:
             raise NameError, "matplotlib package has not been imported."
 
+class Bijective_mapping(Mapping):
+    """This is a mapping that is bijective, i.e. you can go from X to f and also back from f to X. The inverse mapping is called g()."""
+    def __init__(self, input_dim, output_dim, name='bijective_mapping'):
+        super(Bijective_mapping, self).__init__(input_dim, output_dim, name=name)
+
+    def g(self, f):
+        """Inverse mapping from output domain of the function to the inputs."""
+        raise NotImplementedError
 
 from model import Model
diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py
index 37cd71ec..a04c16a5 100644
--- a/GPy/kern/__init__.py
+++ b/GPy/kern/__init__.py
@@ -3,7 +3,6 @@ from _src.rbf import RBF
 from _src.linear import Linear, LinearFull
 from _src.static import Bias, White
 from _src.brownian import Brownian
-from _src.symbolic import Symbolic
 from _src.stationary import Exponential, Matern32, Matern52, ExpQuad, RatQuad, Cosine
 from _src.mlp import MLP
 from _src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52
@@ -12,3 +11,14 @@ from _src.coregionalize import Coregionalize
 from _src.ssrbf import SSRBF
 # TODO: ZD: did you remove this?
 from _src.ODE_UY import ODE_UY
+# TODO: put this in an init file somewhere
+try:
+    import sympy as sym
+    sympy_available=True
+except ImportError:
+    sympy_available=False
+
+if sympy_available:
+    from _src.symbolic import Symbolic
+    from _src.heat_eqinit import Heat_eqinit
+
diff --git a/GPy/kern/_src/symbolic.py b/GPy/kern/_src/symbolic.py
index c7bbae73..79abd91d 100644
--- a/GPy/kern/_src/symbolic.py
+++ b/GPy/kern/_src/symbolic.py
@@ -1,15 +1,17 @@
 # Check Matthew Rocklin's blog post.
 try:
-    import sympy as sp
+    import sympy as sym
     sympy_available=True
     from sympy.utilities.lambdify import lambdify
+    from GPy.util.symbolic import stabilise
 except ImportError:
     sympy_available=False
 
 import numpy as np
 from kern import Kern
+from scipy.special import gammaln, gamma, erf, erfc, erfcx, polygamma
+from GPy.util.functions import normcdf, normcdfln, logistic, logisticln
 from ...core.parameterization import Param
-from ...core.parameterization.transformations import Logexp
 
 class Symbolic(Kern):
    """
    - to handle multiple inputs, call them x_1, z_1, etc
    - to handle multiple correlated outputs, you'll need to add parameters with an index, such as lengthscale_i and lengthscale_j.
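    For example, a sketch of a one-dimensional RBF-style covariance built
    under these conventions (the symbol names lengthscale and variance are
    illustrative; any symbol that is not an x_* or z_* becomes a parameter):

        import sympy as sym
        x_0, z_0, lengthscale, variance = sym.symbols('x_0 z_0 lengthscale variance', positive=True)
        k = variance*sym.exp(-(x_0 - z_0)**2/(2*lengthscale**2))
        kern = Symbolic(input_dim=1, k=k)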
""" - def __init__(self, input_dim, k=None, output_dim=1, name='symbolic', param=None, active_dims=None, operators=None): + def __init__(self, input_dim, k=None, output_dim=1, name='symbolic', param=None, active_dims=None, operators=None, func_modules=[]): if k is None: raise ValueError, "You must provide an argument for the covariance function." - super(Sympykern, self).__init__(input_dim, active_dims, name) - self._sp_k = k + self.func_modules = func_modules + self.func_modules += [{'gamma':gamma, + 'gammaln':gammaln, + 'erf':erf, 'erfc':erfc, + 'erfcx':erfcx, + 'polygamma':polygamma, + 'normcdf':normcdf, + 'normcdfln':normcdfln, + 'logistic':logistic, + 'logisticln':logisticln}, + 'numpy'] + + super(Symbolic, self).__init__(input_dim, active_dims, name) + + self._sym_k = k # pull the variable names out of the symbolic covariance function. - sp_vars = [e for e in k.atoms() if e.is_Symbol] - self._sp_x= sorted([e for e in sp_vars if e.name[0:2]=='x_'],key=lambda x:int(x.name[2:])) - self._sp_z= sorted([e for e in sp_vars if e.name[0:2]=='z_'],key=lambda z:int(z.name[2:])) + sym_vars = [e for e in k.atoms() if e.is_Symbol] + self._sym_x= sorted([e for e in sym_vars if e.name[0:2]=='x_'],key=lambda x:int(x.name[2:])) + self._sym_z= sorted([e for e in sym_vars if e.name[0:2]=='z_'],key=lambda z:int(z.name[2:])) # Check that variable names make sense. - assert all([x.name=='x_%i'%i for i,x in enumerate(self._sp_x)]) - assert all([z.name=='z_%i'%i for i,z in enumerate(self._sp_z)]) - assert len(self._sp_x)==len(self._sp_z) - x_dim=len(self._sp_x) + assert all([x.name=='x_%i'%i for i,x in enumerate(self._sym_x)]) + assert all([z.name=='z_%i'%i for i,z in enumerate(self._sym_z)]) + assert len(self._sym_x)==len(self._sym_z) + x_dim=len(self._sym_x) - self._sp_kdiag = k - for x, z in zip(self._sp_x, self._sp_z): - self._sp_kdiag = self._sp_kdiag.subs(z, x) + self._sym_kdiag = k + for x, z in zip(self._sym_x, self._sym_z): + self._sym_kdiag = self._sym_kdiag.subs(z, x) # If it is a multi-output covariance, add an input for indexing the outputs. self._real_input_dim = x_dim @@ -56,22 +71,22 @@ class Symbolic(Kern): self.output_dim = output_dim # extract parameter names from the covariance - thetas = sorted([e for e in sp_vars if not (e.name[0:2]=='x_' or e.name[0:2]=='z_')],key=lambda e:e.name) + thetas = sorted([e for e in sym_vars if not (e.name[0:2]=='x_' or e.name[0:2]=='z_')],key=lambda e:e.name) # Look for parameters with index (subscripts), they are associated with different outputs. if self.output_dim>1: - self._sp_theta_i = sorted([e for e in thetas if (e.name[-2:]=='_i')], key=lambda e:e.name) - self._sp_theta_j = sorted([e for e in thetas if (e.name[-2:]=='_j')], key=lambda e:e.name) + self._sym_theta_i = sorted([e for e in thetas if (e.name[-2:]=='_i')], key=lambda e:e.name) + self._sym_theta_j = sorted([e for e in thetas if (e.name[-2:]=='_j')], key=lambda e:e.name) # Make sure parameter appears with both indices! 
- assert len(self._sp_theta_i)==len(self._sp_theta_j) - assert all([theta_i.name[:-2]==theta_j.name[:-2] for theta_i, theta_j in zip(self._sp_theta_i, self._sp_theta_j)]) + assert len(self._sym_theta_i)==len(self._sym_theta_j) + assert all([theta_i.name[:-2]==theta_j.name[:-2] for theta_i, theta_j in zip(self._sym_theta_i, self._sym_theta_j)]) # Extract names of shared parameters (those without a subscript) - self._sp_theta = [theta for theta in thetas if theta not in self._sp_theta_i and theta not in self._sp_theta_j] + self._sym_theta = [theta for theta in thetas if theta not in self._sym_theta_i and theta not in self._sym_theta_j] - self.num_split_params = len(self._sp_theta_i) - self._split_theta_names = ["%s"%theta.name[:-2] for theta in self._sp_theta_i] + self.num_split_params = len(self._sym_theta_i) + self._split_theta_names = ["%s"%theta.name[:-2] for theta in self._sym_theta_i] # Add split parameters to the model. for theta in self._split_theta_names: # TODO: what if user has passed a parameter vector, how should that be stored and interpreted? @@ -79,18 +94,18 @@ class Symbolic(Kern): self.add_parameter(getattr(self, theta)) - self.num_shared_params = len(self._sp_theta) - for theta_i, theta_j in zip(self._sp_theta_i, self._sp_theta_j): - self._sp_kdiag = self._sp_kdiag.subs(theta_j, theta_i) + self.num_shared_params = len(self._sym_theta) + for theta_i, theta_j in zip(self._sym_theta_i, self._sym_theta_j): + self._sym_kdiag = self._sym_kdiag.subs(theta_j, theta_i) else: self.num_split_params = 0 self._split_theta_names = [] - self._sp_theta = thetas - self.num_shared_params = len(self._sp_theta) + self._sym_theta = thetas + self.num_shared_params = len(self._sym_theta) # Add parameters to the model. - for theta in self._sp_theta: + for theta in self._sym_theta: val = 1.0 # TODO: what if user has passed a parameter vector, how should that be stored and interpreted? This is the old way before params class. if param is not None: @@ -100,25 +115,25 @@ class Symbolic(Kern): self.add_parameters(getattr(self, theta.name)) # Differentiate with respect to parameters. - derivative_arguments = self._sp_x + self._sp_theta + derivative_arguments = self._sym_x + self._sym_theta if self.output_dim > 1: - derivative_arguments += self._sp_theta_i + derivative_arguments += self._sym_theta_i - self.derivatives = {theta.name : sp.diff(self._sp_k,theta).simplify() for theta in derivative_arguments} - self.diag_derivatives = {theta.name : sp.diff(self._sp_kdiag,theta).simplify() for theta in derivative_arguments} + self.derivatives = {theta.name : stabilise(sym.diff(self._sym_k,theta)) for theta in derivative_arguments} + self.diag_derivatives = {theta.name : stabilise(sym.diff(self._sym_kdiag,theta)) for theta in derivative_arguments} # This gives the parameters for the arg list. - self.arg_list = self._sp_x + self._sp_z + self._sp_theta - self.diag_arg_list = self._sp_x + self._sp_theta + self.arg_list = self._sym_x + self._sym_z + self._sym_theta + self.diag_arg_list = self._sym_x + self._sym_theta if self.output_dim > 1: - self.arg_list += self._sp_theta_i + self._sp_theta_j - self.diag_arg_list += self._sp_theta_i + self.arg_list += self._sym_theta_i + self._sym_theta_j + self.diag_arg_list += self._sym_theta_i # Check if there are additional linear operators on the covariance. 
-        self._sp_operators = operators
+        self._sym_operators = operators
 
         # TODO: Deal with linear operators
-        #if self._sp_operators:
-        #    for operator in self._sp_operators:
+        #if self._sym_operators:
+        #    for operator in self._sym_operators:
 
         # psi_stats aren't yet implemented.
         if False:
             self.compute_psi_stats()
 
         self._gen_code()
 
     def __add__(self,other):
-        return spkern(self._sp_k+other._sp_k)
+        return Symbolic(self.input_dim, self._sym_k + other._sym_k)
 
     def _gen_code(self):
-        #fn_theano = theano_function([self.arg_lists], [self._sp_k + self.derivatives], dims={x: 1}, dtypes={x_0: 'float64', z_0: 'float64'})
-        self._K_function = lambdify(self.arg_list, self._sp_k, 'numpy')
-        for key in self.derivatives.keys():
-            setattr(self, '_K_diff_' + key, lambdify(self.arg_list, self.derivatives[key], 'numpy'))
-
-        self._Kdiag_function = lambdify(self.diag_arg_list, self._sp_kdiag, 'numpy')
-        for key in self.derivatives.keys():
-            setattr(self, '_Kdiag_diff_' + key, lambdify(self.diag_arg_list, self.diag_derivatives[key], 'numpy'))
+        #fn_theano = theano_function([self.arg_lists], [self._sym_k + self.derivatives], dims={x: 1}, dtypes={x_0: 'float64', z_0: 'float64'})
+        self._K_function = lambdify(self.arg_list, self._sym_k, self.func_modules)
+        self._K_derivatives_code = {key: lambdify(self.arg_list, self.derivatives[key], self.func_modules) for key in self.derivatives.keys()}
+        self._Kdiag_function = lambdify(self.diag_arg_list, self._sym_kdiag, self.func_modules)
+        self._Kdiag_derivatives_code = {key: lambdify(self.diag_arg_list, self.diag_derivatives[key], self.func_modules) for key in self.diag_derivatives.keys()}
 
     def K(self,X,X2=None):
         self._K_computations(X, X2)
@@ -157,8 +169,8 @@ class Symbolic(Kern):
         #if self._X is None or X.base is not self._X.base or X2 is not None:
         self._K_computations(X, X2)
         gradients_X = np.zeros((X.shape[0], X.shape[1]))
-        for i, x in enumerate(self._sp_x):
-            gf = getattr(self, '_K_diff_' + x.name)
+        for i, x in enumerate(self._sym_x):
+            gf = self._K_derivatives_code[x.name]
             gradients_X[:, i] = (gf(**self._arguments)*dL_dK).sum(1)
         if X2 is None: gradients_X *= 2
 
     def gradients_X_diag(self, dL_dK, X):
         self._K_computations(X)
         dX = np.zeros_like(X)
-        for i, x in enumerate(self._sp_x):
-            gf = getattr(self, '_Kdiag_diff_' + x.name)
+        for i, x in enumerate(self._sym_x):
+            gf = self._Kdiag_derivatives_code[x.name]
             dX[:, i] = gf(**self._diag_arguments)*dL_dK
         return dX
 
     def update_gradients_full(self, dL_dK, X, X2=None):
         # Need to extract parameters to local variables first
         self._K_computations(X, X2)
-        for theta in self._sp_theta:
+        for theta in self._sym_theta:
             parameter = getattr(self, theta.name)
-            gf = getattr(self, '_K_diff_' + theta.name)
+            gf = self._K_derivatives_code[theta.name]
             gradient = (gf(**self._arguments)*dL_dK).sum()
             if X2 is not None:
                 gradient += (gf(**self._reverse_arguments)*dL_dK).sum()
             setattr(parameter, 'gradient', gradient)
         if self.output_dim>1:
-            for theta in self._sp_theta_i:
+            for theta in self._sym_theta_i:
                 parameter = getattr(self, theta.name[:-2])
-                gf = getattr(self, '_K_diff_' + theta.name)
+                gf = self._K_derivatives_code[theta.name]
                 A = gf(**self._arguments)*dL_dK
                 gradient = np.asarray([A[np.where(self._output_ind==i)].T.sum()
                                        for i in np.arange(self.output_dim)])
@@ -200,14 +212,14 @@ class Symbolic(Kern):
 
     def update_gradients_diag(self, dL_dKdiag, X):
         self._K_computations(X)
-        for theta in self._sp_theta:
+        for theta in self._sym_theta:
             parameter = getattr(self, theta.name)
-            gf = getattr(self, '_Kdiag_diff_' + 
theta.name) + gf = self._Kdiag_derivatives_code[theta.name] setattr(parameter, 'gradient', (gf(**self._diag_arguments)*dL_dKdiag).sum()) if self.output_dim>1: - for theta in self._sp_theta_i: + for theta in self._sym_theta_i: parameter = getattr(self, theta.name[:-2]) - gf = getattr(self, '_Kdiag_diff_' + theta.name) + gf = self._Kdiag_derivatives_code[theta.name] a = gf(**self._diag_arguments)*dL_dKdiag setattr(parameter, 'gradient', np.asarray([a[np.where(self._output_ind==i)].sum() @@ -220,40 +232,40 @@ class Symbolic(Kern): # parameter updates here. self._arguments = {} self._diag_arguments = {} - for i, x in enumerate(self._sp_x): + for i, x in enumerate(self._sym_x): self._arguments[x.name] = X[:, i][:, None] self._diag_arguments[x.name] = X[:, i][:, None] if self.output_dim > 1: self._output_ind = np.asarray(X[:, -1], dtype='int') - for i, theta in enumerate(self._sp_theta_i): + for i, theta in enumerate(self._sym_theta_i): self._arguments[theta.name] = np.asarray(getattr(self, theta.name[:-2])[self._output_ind])[:, None] self._diag_arguments[theta.name] = self._arguments[theta.name] - for theta in self._sp_theta: + for theta in self._sym_theta: self._arguments[theta.name] = np.asarray(getattr(self, theta.name)) self._diag_arguments[theta.name] = self._arguments[theta.name] if X2 is not None: - for i, z in enumerate(self._sp_z): + for i, z in enumerate(self._sym_z): self._arguments[z.name] = X2[:, i][None, :] if self.output_dim > 1: self._output_ind2 = np.asarray(X2[:, -1], dtype='int') - for i, theta in enumerate(self._sp_theta_j): + for i, theta in enumerate(self._sym_theta_j): self._arguments[theta.name] = np.asarray(getattr(self, theta.name[:-2])[self._output_ind2])[None, :] else: - for z in self._sp_z: + for z in self._sym_z: self._arguments[z.name] = self._arguments['x_'+z.name[2:]].T if self.output_dim > 1: self._output_ind2 = self._output_ind - for theta in self._sp_theta_j: + for theta in self._sym_theta_j: self._arguments[theta.name] = self._arguments[theta.name[:-2] + '_i'].T if X2 is not None: # These arguments are needed in gradients when X2 is not equal to X. 
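            # (they swap the roles of x and z to give the K(X2, X)
            # direction; a copy is taken below so that self._arguments
            # is not modified in place.)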
            self._reverse_arguments = dict(self._arguments)
-            for x, z in zip(self._sp_x, self._sp_z):
+            for x, z in zip(self._sym_x, self._sym_z):
                 self._reverse_arguments[x.name] = self._arguments[z.name].T
                 self._reverse_arguments[z.name] = self._arguments[x.name].T
             if self.output_dim > 1:
-                for theta_i, theta_j in zip(self._sp_theta_i, self._sp_theta_j):
+                for theta_i, theta_j in zip(self._sym_theta_i, self._sym_theta_j):
                     self._reverse_arguments[theta_i.name] = self._arguments[theta_j.name].T
                     self._reverse_arguments[theta_j.name] = self._arguments[theta_i.name].T
@@ -265,7 +277,7 @@ if False:
         def __init__(self, subkerns, operations, name='sympy_combine'):
             super(Symcombine, self).__init__(subkerns, name)
             for subkern, operation in zip(subkerns, operations):
-                self._sp_k += self._k_double_operate(subkern._sp_k, operation)
+                self._sym_k += self._k_double_operate(subkern._sym_k, operation)
 
     #def _double_operate(self, k, operation):
diff --git a/GPy/likelihoods/negative_binomial.py b/GPy/likelihoods/negative_binomial.py
index 5bc5b727..1130375b 100644
--- a/GPy/likelihoods/negative_binomial.py
+++ b/GPy/likelihoods/negative_binomial.py
@@ -6,7 +6,7 @@ try:
     import sympy as sym
     sympy_available=True
     from sympy.utilities.lambdify import lambdify
-    from GPy.util.symbolic import gammaln, ln_cum_gaussian, cum_gaussian
+    from GPy.util.symbolic import gammaln, logisticln
 except ImportError:
     sympy_available=False
 
@@ -33,12 +33,14 @@ if sympy_available:
         """
         def __init__(self, gp_link=None):
             if gp_link is None:
-                gp_link = link_functions.Log()
+                gp_link = link_functions.Identity()
             dispersion = sym.Symbol('dispersion', positive=True, real=True)
             y = sym.Symbol('y', nonnegative=True, integer=True)
             f = sym.Symbol('f', positive=True, real=True)
+            gp_link = link_functions.Log()
             log_pdf=dispersion*sym.log(dispersion) - (dispersion+y)*sym.log(dispersion+f) + gammaln(y+dispersion) - gammaln(y+1) - gammaln(dispersion) + y*sym.log(f)
+            #log_pdf= -(dispersion+y)*logisticln(f-sym.log(dispersion)) + gammaln(y+dispersion) - gammaln(y+1) - gammaln(dispersion) + y*(f-sym.log(dispersion))
             super(Negative_binomial, self).__init__(log_pdf=log_pdf, gp_link=gp_link, name='Negative_binomial')
 # TODO: Check this.
diff --git a/GPy/likelihoods/symbolic.py b/GPy/likelihoods/symbolic.py
index 5d052119..65c90183 100644
--- a/GPy/likelihoods/symbolic.py
+++ b/GPy/likelihoods/symbolic.py
@@ -5,14 +5,15 @@ try:
     import sympy as sym
     sympy_available=True
     from sympy.utilities.lambdify import lambdify
+    from GPy.util.symbolic import stabilise
 except ImportError:
     sympy_available=False
 
 import numpy as np
 import link_functions
 from scipy import stats, integrate
-from scipy.special import gammaln, gamma, erf, polygamma
-from GPy.util.functions import cum_gaussian, ln_cum_gaussian
+from scipy.special import gammaln, gamma, erf, erfc, erfcx, polygamma
+from GPy.util.functions import normcdf, normcdfln, logistic, logisticln
 from likelihood import Likelihood
 from ..core.parameterization import Param
@@ -33,7 +34,17 @@ if sympy_available:
         if log_pdf is None:
             raise ValueError, "You must provide an argument for the log pdf."
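        # log_pdf is expected to be a sympy expression in the latent
        # function f, the observed data y and any parameter symbols
        # (see Negative_binomial above for an example).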
- self.func_modules = func_modules + [{'gamma':gamma, 'gammaln':gammaln, 'erf':erf,'polygamma':polygamma, 'cum_gaussian':cum_gaussian, 'ln_cum_gaussian':ln_cum_gaussian}, 'numpy'] + self.func_modules = func_modules + self.func_modules += [{'gamma':gamma, + 'gammaln':gammaln, + 'erf':erf, 'erfc':erfc, + 'erfcx':erfcx, + 'polygamma':polygamma, + 'normcdf':normcdf, + 'normcdfln':normcdfln, + 'logistic':logistic, + 'logisticln':logisticln}, + 'numpy'] super(Symbolic, self).__init__(gp_link, name=name) self.missing_data = False @@ -58,7 +69,7 @@ if sympy_available: sym_vars = [e for e in self._sym_missing_log_pdf.atoms() if e.is_Symbol] sym_f = [e for e in sym_vars if e.name=='f'] if not sym_f: - raise ValueError('No variable f in missing log pdf.') + raise ValueError('No variable f in missing data log pdf.') sym_y = [e for e in sym_vars if e.name=='y'] if sym_y: raise ValueError('Data is present in missing data portion of likelihood.') @@ -74,15 +85,15 @@ if sympy_available: derivative_arguments = self._sym_f + self._sym_theta # Do symbolic work to compute derivatives. - self._log_pdf_derivatives = {theta.name : sym.diff(self._sym_log_pdf,theta).simplify() for theta in derivative_arguments} - self._log_pdf_second_derivatives = {theta.name : sym.diff(self._log_pdf_derivatives['f'],theta).simplify() for theta in derivative_arguments} - self._log_pdf_third_derivatives = {theta.name : sym.diff(self._log_pdf_second_derivatives['f'],theta).simplify() for theta in derivative_arguments} + self._log_pdf_derivatives = {theta.name : stabilise(sym.diff(self._sym_log_pdf,theta)) for theta in derivative_arguments} + self._log_pdf_second_derivatives = {theta.name : stabilise(sym.diff(self._log_pdf_derivatives['f'],theta)) for theta in derivative_arguments} + self._log_pdf_third_derivatives = {theta.name : stabilise(sym.diff(self._log_pdf_second_derivatives['f'],theta)) for theta in derivative_arguments} if self.missing_data: # Do symbolic work to compute derivatives. - self._missing_log_pdf_derivatives = {theta.name : sym.diff(self._sym_missing_log_pdf,theta).simplify() for theta in derivative_arguments} - self._missing_log_pdf_second_derivatives = {theta.name : sym.diff(self._missing_log_pdf_derivatives['f'],theta).simplify() for theta in derivative_arguments} - self._missing_log_pdf_third_derivatives = {theta.name : sym.diff(self._missing_log_pdf_second_derivatives['f'],theta).simplify() for theta in derivative_arguments} + self._missing_log_pdf_derivatives = {theta.name : stabilise(sym.diff(self._sym_missing_log_pdf,theta)) for theta in derivative_arguments} + self._missing_log_pdf_second_derivatives = {theta.name : stabilise(sym.diff(self._missing_log_pdf_derivatives['f'],theta)) for theta in derivative_arguments} + self._missing_log_pdf_third_derivatives = {theta.name : stabilise(sym.diff(self._missing_log_pdf_second_derivatives['f'],theta)) for theta in derivative_arguments} # Add parameters to the model. @@ -96,7 +107,7 @@ if sympy_available: self.add_parameters(getattr(self, theta.name)) - # Is there some way to check whether the pdf is log + # TODO: Is there an easy way to check whether the pdf is log # concave? For the moment, need user to specify. self.log_concave = log_concave @@ -112,16 +123,15 @@ if sympy_available: # functions accordingly. 
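        # lambdify compiles each sympy expression into a vectorised numpy
        # function of the symbols in arg_list; func_modules (set up in
        # __init__) supplies the numerical implementations of special
        # functions such as gammaln or normcdfln.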
self._log_pdf_function = lambdify(self.arg_list, self._sym_log_pdf, self.func_modules) - # compute code for derivatives (for implicit likelihood terms - # we need up to 3rd derivatives) - setattr(self, '_first_derivative_code', {key: lambdify(self.arg_list, self._log_pdf_derivatives[key], self.func_modules) for key in self._log_pdf_derivatives.keys()}) - setattr(self, '_second_derivative_code', {key: lambdify(self.arg_list, self._log_pdf_second_derivatives[key], self.func_modules) for key in self._log_pdf_second_derivatives.keys()}) - setattr(self, '_third_derivative_code', {key: lambdify(self.arg_list, self._log_pdf_third_derivatives[key], self.func_modules) for key in self._log_pdf_third_derivatives.keys()}) + # compute code for derivatives + self._derivative_code = {key: lambdify(self.arg_list, self._log_pdf_derivatives[key], self.func_modules) for key in self._log_pdf_derivatives.keys()} + self._second_derivative_code = {key: lambdify(self.arg_list, self._log_pdf_second_derivatives[key], self.func_modules) for key in self._log_pdf_second_derivatives.keys()} + self._third_derivative_code = {key: lambdify(self.arg_list, self._log_pdf_third_derivatives[key], self.func_modules) for key in self._log_pdf_third_derivatives.keys()} if self.missing_data: - setattr(self, '_missing_first_derivative_code', {key: lambdify(self.arg_list, self._missing_log_pdf_derivatives[key], self.func_modules) for key in self._missing_log_pdf_derivatives.keys()}) - setattr(self, '_missing_second_derivative_code', {key: lambdify(self.arg_list, self._missing_log_pdf_second_derivatives[key], self.func_modules) for key in self._missing_log_pdf_second_derivatives.keys()}) - setattr(self, '_missing_third_derivative_code', {key: lambdify(self.arg_list, self._missing_log_pdf_third_derivatives[key], self.func_modules) for key in self._missing_log_pdf_third_derivatives.keys()}) + self._missing_derivative_code = {key: lambdify(self.arg_list, self._missing_log_pdf_derivatives[key], self.func_modules) for key in self._missing_log_pdf_derivatives.keys()} + self._missing_second_derivative_code = {key: lambdify(self.arg_list, self._missing_log_pdf_second_derivatives[key], self.func_modules) for key in self._missing_log_pdf_second_derivatives.keys()} + self._missing_third_derivative_code = {key: lambdify(self.arg_list, self._missing_log_pdf_third_derivatives[key], self.func_modules) for key in self._missing_log_pdf_third_derivatives.keys()} # TODO: compute EP code parts based on logZ. 
We need dlogZ/dmu, d2logZ/dmu2 and dlogZ/dtheta
@@ -210,9 +220,9 @@ if sympy_available:
         assert np.atleast_1d(inv_link_f).shape == np.atleast_1d(y).shape
         self._arguments_update(inv_link_f, y)
         if self.missing_data:
-            return np.where(np.isnan(y), self._missing_first_derivative_code['f'](**self._missing_argments), self._first_derivative_code['f'](**self._argments))
+            return np.where(np.isnan(y), self._missing_derivative_code['f'](**self._missing_arguments), self._derivative_code['f'](**self._arguments))
         else:
-            return np.where(np.isnan(y), 0., self._first_derivative_code['f'](**self._arguments))
+            return np.where(np.isnan(y), 0., self._derivative_code['f'](**self._arguments))
 
     def d2logpdf_dlink2(self, inv_link_f, y, Y_metadata=None):
         """
@@ -255,9 +265,9 @@ if sympy_available:
         g = np.zeros((np.atleast_1d(y).shape[0], len(self._sym_theta)))
         for i, theta in enumerate(self._sym_theta):
             if self.missing_data:
-                g[:, i:i+1] = np.where(np.isnan(y), self._missing_first_derivative_code[theta.name](**self._arguments), self._first_derivative_code[theta.name](**self._arguments))
+                g[:, i:i+1] = np.where(np.isnan(y), self._missing_derivative_code[theta.name](**self._arguments), self._derivative_code[theta.name](**self._arguments))
             else:
-                g[:, i:i+1] = np.where(np.isnan(y), 0., self._first_derivative_code[theta.name](**self._arguments))
+                g[:, i:i+1] = np.where(np.isnan(y), 0., self._derivative_code[theta.name](**self._arguments))
         return g.sum(0)
 
     def dlogpdf_dlink_dtheta(self, inv_link_f, y, Y_metadata=None):
diff --git a/GPy/mappings/__init__.py b/GPy/mappings/__init__.py
index 97573aba..33f1d6b0 100644
--- a/GPy/mappings/__init__.py
+++ b/GPy/mappings/__init__.py
@@ -1,7 +1,17 @@
-# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2013, 2014 GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 from kernel import Kernel
 from linear import Linear
 from mlp import MLP
 #from rbf import RBF
+# TODO need to fix this in a config file.
+try:
+    import sympy as sym
+    sympy_available=True
+except ImportError:
+    sympy_available=False
+
+if sympy_available:
+    # These are mappings that rely on sympy.
+    from symbolic import Symbolic
diff --git a/GPy/mappings/linear.py b/GPy/mappings/linear.py
index 075b8556..24a45511 100644
--- a/GPy/mappings/linear.py
+++ b/GPy/mappings/linear.py
@@ -1,11 +1,11 @@
-# Copyright (c) 2013, GPy authors (see AUTHORS.txt).
+# Copyright (c) 2013, 2014 GPy authors (see AUTHORS.txt).
 # Licensed under the BSD 3-clause license (see LICENSE.txt)
 
 import numpy as np
-from ..core.mapping import Mapping
+from ..core.mapping import Bijective_mapping
 from ..core.parameterization import Param
 
-class Linear(Mapping):
+class Linear(Bijective_mapping):
     """
     Mapping based on a linear model.
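    A sketch of the underlying maps, assuming W has full column rank so
    that W^T W is invertible:

        f(X) = X W + bias
        g(f) = (f - bias) (W^T W)^{-1} W^T

    which gives f(g(y)) = y (implemented in g() below).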
@@ -20,8 +20,8 @@ class Linear(Mapping):
     """
 
-    def __init__(self, input_dim=1, output_dim=1, name='linear_map'):
-        Mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
+    def __init__(self, input_dim=1, output_dim=1, name='linear'):
+        Bijective_mapping.__init__(self, input_dim=input_dim, output_dim=output_dim, name=name)
         self.W = Param('W',np.array((self.input_dim, self.output_dim)))
         self.bias = Param('bias',np.array(self.output_dim))
         self.add_parameters(self.W, self.bias)
@@ -29,10 +29,15 @@ class Linear(Mapping):
     def f(self, X):
         return np.dot(X,self.W) + self.bias
 
+    def g(self, f):
+        V = np.linalg.solve(np.dot(self.W.T, self.W), self.W.T)
+        return np.dot(f-self.bias, V)
+
     def df_dtheta(self, dL_df, X):
         df_dW = (dL_df[:, :, None]*X[:, None, :]).sum(0).T
         df_dbias = (dL_df.sum(0))
         return np.hstack((df_dW.flatten(), df_dbias))
 
-    def dL_dX(self, dL_df, X):
-        return (dL_df[:, None, :]*self.W[None, :, :]).sum(2)
+    def dL_dX(self, partial, X):
+        """The gradient of L with respect to the inputs to the mapping, where L is a function that is dependent on the output of the mapping, f."""
+        return (partial[:, None, :]*self.W[None, :, :]).sum(2)
diff --git a/GPy/util/functions.py b/GPy/util/functions.py
index a9ee1084..1cb1f7cb 100644
--- a/GPy/util/functions.py
+++ b/GPy/util/functions.py
@@ -1,18 +1,21 @@
 import numpy as np
-from scipy.special import erf, erfcx
+from scipy.special import erf, erfc, erfcx
 import sys
 epsilon = sys.float_info.epsilon
 lim_val = -np.log(epsilon)
 
-def cum_gaussian(x):
-    g=0.5*(1+erf(x/np.sqrt(2)))
+def logisticln(x):
+    return np.where(x<lim_val, np.where(x>-lim_val, -np.log(1+np.exp(-x)), x), -np.log(1+epsilon))
+
+def logistic(x):
+    return np.where(x<lim_val, np.where(x>-lim_val, 1/(1+np.exp(-x)), epsilon/(epsilon+1)), 1/(1+epsilon))
+
+def normcdf(x):
+    g=0.5*erfc(-x/np.sqrt(2))
     return np.where(g==0, epsilon, np.where(g==1, 1-epsilon, g))
 
-def ln_cum_gaussian(x):
-    return np.where(x < 0, -.5*x*x + np.log(.5) + np.log(erfcx(-np.sqrt(2)/2*x)), np.log(cum_gaussian(x)))
+def normcdfln(x):
+    return np.where(x < 0, -.5*x*x + np.log(.5) + np.log(erfcx(-x/np.sqrt(2))), np.log(normcdf(x)))
 
 def clip_exp(x):
-    if any(x>=lim_val) or any(x<=-lim_val):
-        return np.where(x<lim_val, np.where(x>-lim_val, np.exp(x), np.exp(-lim_val)), np.exp(lim_val))
-    else:
-        return np.exp(x)
+    return np.where(x<lim_val, np.where(x>-lim_val, np.exp(x), epsilon), 1/epsilon)
diff --git a/GPy/util/symbolic.py b/GPy/util/symbolic.py
index 5b3ac312..31aa1375 100644
--- a/GPy/util/symbolic.py
+++ b/GPy/util/symbolic.py
@@ -1,6 +1,56 @@
+import sympy as sym
 from sympy import Function, S, oo, I, cos, sin, asin, log, erf, pi, exp, sqrt, sign, gamma,polygamma
 
+def stabilise(e):
+    """Attempt to find the most numerically stable form of an expression"""
+    return e #sym.expand(e)
+
+class logistic(Function):
+    """The logistic function as a symbolic function."""
+    nargs = 1
+    def fdiff(self, argindex=1):
+        x = self.args[0]
+        return logistic(x)*(1-logistic(x))
+
+    @classmethod
+    def eval(cls, x):
+        if x.is_Number:
+            return 1/(1+exp(-x))
+
+class logisticln(Function):
+    """The log logistic, which can often be computed with more precision than simply taking log(logistic(x)) when x is small or large."""
+    nargs = 1
+
+    def fdiff(self, argindex=1):
+        x = self.args[0]
+        return 1-logistic(x)
+
+    @classmethod
+    def eval(cls, x):
+        if x.is_Number:
+            return -log(1+exp(-x))
+
+class erfc(Function):
+    """The complementary error function, erfc(x) = 1-erf(x). Used as a helper function, particularly for erfcx, the scaled complementary error function,
and the normal distribution's cdf."""
+    nargs = 1
+
+    @classmethod
+    def eval(cls, arg):
+        return 1-erf(arg)
+
+class erfcx(Function):
+    nargs = 1
+    def fdiff(self, argindex=1):
+        x = self.args[0]
+        return 2*x*erfcx(x)-2/sqrt(pi)
+
+    @classmethod
+    def eval(cls, x):
+        if x.is_Number:
+            return exp(x**2)*erfc(x)
+
 class gammaln(Function):
+    """The log of the gamma function, which is often needed instead of log(gamma(x)) for better accuracy for large x."""
     nargs = 1
 
     def fdiff(self, argindex=1):
@@ -13,22 +63,26 @@
         return log(gamma(x))
 
-class ln_cum_gaussian(Function):
+class normcdfln(Function):
+    """The log of the normal cdf. Can often be computed with better accuracy than log(normcdf(x)), particularly when x is either small or large."""
     nargs = 1
 
     def fdiff(self, argindex=1):
         x = self.args[0]
-        return exp(-ln_cum_gaussian(x) - 0.5*x*x)/sqrt(2*pi)
+        #return -erfcx(-x/sqrt(2))/sqrt(2*pi)
+        #return exp(-normcdfln(x) - 0.5*x*x)/sqrt(2*pi)
+        return sqrt(2/pi)*1/erfcx(-x/sqrt(2))
 
     @classmethod
     def eval(cls, x):
         if x.is_Number:
-            return log(cum_gaussian(x))
+            return log(normcdf(x))
 
     def _eval_is_real(self):
         return self.args[0].is_real
 
-class cum_gaussian(Function):
+class normcdf(Function):
+    """The cumulative distribution function of the standard normal. Provided as a convenient helper function. It is computed through 0.5*erfc(-x/sqrt(2))."""
     nargs = 1
     def fdiff(self, argindex=1):
         x = self.args[0]
@@ -37,12 +91,30 @@
     @classmethod
     def eval(cls, x):
         if x.is_Number:
-            return 0.5*(1+erf(sqrt(2)/2*x))
+            return 0.5*(erfc(-x/sqrt(2)))
 
     def _eval_is_real(self):
         return self.args[0].is_real
 
-class gaussian(Function):
+class normalln(Function):
+    """The log of the standard normal distribution."""
+    nargs = 1
+    def fdiff(self, argindex=1):
+        x = self.args[0]
+        return -x
+
+    @classmethod
+    def eval(cls, x):
+        if x.is_Number:
+            return -0.5*log(2*pi) - 0.5*x*x
+
+
+    def _eval_is_real(self):
+        return True
+
+
+class normal(Function):
+    """The standard normal distribution. Provided as a convenience function."""
     nargs = 1
     @classmethod
     def eval(cls, x):
@@ -273,17 +345,5 @@ class h(Function):
 #                   *(erf(tprime/l - d_j/2.*l)
 #                   + erf(d_j/2.*l))))
 
-class erfc(Function):
-    nargs = 1
-
-    @classmethod
-    def eval(cls, arg):
-        return 1-erf(arg)
-
-class erfcx(Function):
-    nargs = 1
-
-    @classmethod
-    def eval(cls, arg):
-        return erfc(arg)*exp(arg*arg)

From 41ef7f4c72819ef19577eb3f4eb22c75c21ee5f9 Mon Sep 17 00:00:00 2001
From: Neil Lawrence
Date: Tue, 8 Apr 2014 06:09:30 +0100
Subject: [PATCH 2/7] Ongoing changes to symbolic.

---
 GPy/core/mapping.py                           |  12 +-
 GPy/core/parameterization/observable_array.py | 137 ------------------
 GPy/core/parameterization/param.py            |   2 +-
 GPy/kern/__init__.py                          |   1 +
 GPy/kern/_src/symbolic.py                     |   3 +-
 GPy/likelihoods/symbolic.py                   |  29 ++--
 GPy/util/functions.py                         |   4 +
 GPy/util/symbolic.py                          |  55 ++++++-
 8 files changed, 77 insertions(+), 166 deletions(-)
 delete mode 100644 GPy/core/parameterization/observable_array.py

diff --git a/GPy/core/mapping.py b/GPy/core/mapping.py
index 8cfd9ad0..efd9476f 100644
--- a/GPy/core/mapping.py
+++ b/GPy/core/mapping.py
@@ -34,7 +34,7 @@ class Mapping(Parameterized):
         raise NotImplementedError
 
     def df_dtheta(self, dL_df, X):
-        """The gradient of the outputs of the mapping with respect to each of the parameters.
+        """The gradient of the outputs of the multi-layer perceptron with respect to each of the parameters.
 
         :param dL_df: gradient of the objective with respect to the function.
         :type dL_df: ndarray (num_data x output_dim)
@@ -50,7 +50,7 @@
         """
         Plots the mapping associated with the model.
           - In one dimension, the function is plotted.
-          - In two dimensions, a contour-plot shows the function
+          - In two dimsensions, a contour-plot shows the function
           - In higher dimensions, we've not implemented this yet !TODO!
 
         Can plot only part of the data and part of the posterior functions
@@ -65,14 +65,6 @@
         else:
             raise NameError, "matplotlib package has not been imported."
 
-class Bijective_mapping(Mapping):
-    """This is a mapping that is bijective, i.e. you can go from X to f and also back from f to X. The inverse mapping is called g()."""
-    def __init__(self, input_dim, output_dim, name='bijective_mapping'):
-        super(Bijective_mapping, self).__init__(input_dim, output_dim, name=name)
-
-    def g(self, f):
-        """Inverse mapping from output domain of the function to the inputs."""
-        raise NotImplementedError
 
 from model import Model
diff --git a/GPy/core/parameterization/observable_array.py b/GPy/core/parameterization/observable_array.py
deleted file mode 100644
index fc9d6cf2..00000000
--- a/GPy/core/parameterization/observable_array.py
+++ /dev/null
@@ -1,137 +0,0 @@
-# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
-# Licensed under the BSD 3-clause license (see LICENSE.txt)
-
-__updated__ = '2014-03-31'
-
-import numpy as np
-from parameter_core import Observable, Pickleable
-
-class ObsAr(np.ndarray, Pickleable, Observable):
-    """
-    An ndarray which reports changes to its observers.
-    The observers can add themselves with a callable, which
-    will be called every time this array changes. The callable
-    takes exactly one argument, which is this array itself.
-    """
-    __array_priority__ = -1 # Never give back ObsAr
-    def __new__(cls, input_array, *a, **kw):
-        if not isinstance(input_array, ObsAr):
-            obj = np.atleast_1d(np.require(input_array, dtype=np.float64, requirements=['W', 'C'])).view(cls)
-        else: obj = input_array
-        #cls.__name__ = "ObsAr" # because of fixed printing of `array` in np printing
-        super(ObsAr, obj).__init__(*a, **kw)
-        return obj
-
-    def __array_finalize__(self, obj):
-        # see InfoArray.__array_finalize__ for comments
-        if obj is None: return
-        self._observer_callables_ = getattr(obj, '_observer_callables_', None)
-
-    def __array_wrap__(self, out_arr, context=None):
-        return out_arr.view(np.ndarray)
-
-    def copy(self):
-        memo = {}
-        memo[id(self)] = self
-        return self.__deepcopy__(memo)
-
-    def __deepcopy__(self, memo):
-        s = self.__new__(self.__class__, input_array=self.view(np.ndarray).copy())
-        memo[id(self)] = s
-        import copy
-        s.__dict__.update(copy.deepcopy(self.__dict__, memo))
-        return s
-
-    def __reduce__(self):
-        func, args, state = super(ObsAr, self).__reduce__()
-        return func, args, (state, Pickleable.__getstate__(self))
-
-    def __setstate__(self, state):
-        np.ndarray.__setstate__(self, state[0])
-        Pickleable.__setstate__(self, state[1])
-
-    def __setitem__(self, s, val):
-        super(ObsAr, self).__setitem__(s, val)
-        self.notify_observers()
-
-    def __getslice__(self, start, stop):
-        return self.__getitem__(slice(start, stop))
-
-    def __setslice__(self, start, stop, val):
-        return self.__setitem__(slice(start, stop), val)
-
-    def __ilshift__(self, *args, **kwargs):
-        r = np.ndarray.__ilshift__(self, *args, **kwargs)
-        self.notify_observers()
-        return r
-
-    def __irshift__(self, *args, **kwargs):
-        r = np.ndarray.__irshift__(self, *args, **kwargs)
-        self.notify_observers()
-        return r
-
-
-    def __ixor__(self, *args,
**kwargs): - r = np.ndarray.__ixor__(self, *args, **kwargs) - self.notify_observers() - return r - - - def __ipow__(self, *args, **kwargs): - r = np.ndarray.__ipow__(self, *args, **kwargs) - self.notify_observers() - return r - - - def __ifloordiv__(self, *args, **kwargs): - r = np.ndarray.__ifloordiv__(self, *args, **kwargs) - self.notify_observers() - return r - - - def __isub__(self, *args, **kwargs): - r = np.ndarray.__isub__(self, *args, **kwargs) - self.notify_observers() - return r - - - def __ior__(self, *args, **kwargs): - r = np.ndarray.__ior__(self, *args, **kwargs) - self.notify_observers() - return r - - - def __itruediv__(self, *args, **kwargs): - r = np.ndarray.__itruediv__(self, *args, **kwargs) - self.notify_observers() - return r - - - def __idiv__(self, *args, **kwargs): - r = np.ndarray.__idiv__(self, *args, **kwargs) - self.notify_observers() - return r - - - def __iand__(self, *args, **kwargs): - r = np.ndarray.__iand__(self, *args, **kwargs) - self.notify_observers() - return r - - - def __imod__(self, *args, **kwargs): - r = np.ndarray.__imod__(self, *args, **kwargs) - self.notify_observers() - return r - - - def __iadd__(self, *args, **kwargs): - r = np.ndarray.__iadd__(self, *args, **kwargs) - self.notify_observers() - return r - - - def __imul__(self, *args, **kwargs): - r = np.ndarray.__imul__(self, *args, **kwargs) - self.notify_observers() - return r \ No newline at end of file diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index 60bdfe9d..182af902 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -4,7 +4,7 @@ import itertools import numpy from parameter_core import OptimizationHandlable, adjust_name_for_printing -from observable_array import ObsAr +from array_core import ObsAr ###### printing __constraints_name__ = "Constraint" diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py index a04c16a5..ef99e9a6 100644 --- a/GPy/kern/__init__.py +++ b/GPy/kern/__init__.py @@ -21,4 +21,5 @@ except ImportError: if sympy_available: from _src.symbolic import Symbolic from _src.heat_eqinit import Heat_eqinit + from _src.ode1_eq_lfm import Ode1_eq_lfm diff --git a/GPy/kern/_src/symbolic.py b/GPy/kern/_src/symbolic.py index 79abd91d..91193c5d 100644 --- a/GPy/kern/_src/symbolic.py +++ b/GPy/kern/_src/symbolic.py @@ -10,7 +10,7 @@ except ImportError: import numpy as np from kern import Kern from scipy.special import gammaln, gamma, erf, erfc, erfcx, polygamma -from GPy.util.functions import normcdf, normcdfln, logistic, logisticln +from GPy.util.functions import normcdf, normcdfln, logistic, logisticln, differfln from ...core.parameterization import Param class Symbolic(Kern): @@ -39,6 +39,7 @@ class Symbolic(Kern): 'erf':erf, 'erfc':erfc, 'erfcx':erfcx, 'polygamma':polygamma, + 'differfln':differfln, 'normcdf':normcdf, 'normcdfln':normcdfln, 'logistic':logistic, diff --git a/GPy/likelihoods/symbolic.py b/GPy/likelihoods/symbolic.py index 65c90183..370f4599 100644 --- a/GPy/likelihoods/symbolic.py +++ b/GPy/likelihoods/symbolic.py @@ -117,21 +117,30 @@ if sympy_available: # generate the code for the pdf and derivatives self._gen_code() + def list_functions(self): + """Return a list of all symbolic functions in the model and their names.""" def _gen_code(self): """Generate the code from the symbolic parts that will be used for likleihod computation.""" # TODO: Check here whether theano is available and set up # functions accordingly. 
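+        # Gather every expression into a single list so that sympy's common
+        # sub-expression elimination can operate across the log pdf and all
+        # of its derivatives at once (see gen_code in GPy.util.symbolic).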
-        self._log_pdf_function = lambdify(self.arg_list, self._sym_log_pdf, self.func_modules)
-
-        # compute code for derivatives
-        self._derivative_code = {key: lambdify(self.arg_list, self._log_pdf_derivatives[key], self.func_modules) for key in self._log_pdf_derivatives.keys()}
-        self._second_derivative_code = {key: lambdify(self.arg_list, self._log_pdf_second_derivatives[key], self.func_modules) for key in self._log_pdf_second_derivatives.keys()}
-        self._third_derivative_code = {key: lambdify(self.arg_list, self._log_pdf_third_derivatives[key], self.func_modules) for key in self._log_pdf_third_derivatives.keys()}
-
+        symbolic_functions = [self._sym_log_pdf]
+        deriv_list = [self._log_pdf_derivatives, self._log_pdf_second_derivatives, self._log_pdf_third_derivatives]
+        symbolic_functions += [deriv[key] for deriv in deriv_list for key in sorted(deriv.keys())]
         if self.missing_data:
-            self._missing_derivative_code = {key: lambdify(self.arg_list, self._missing_log_pdf_derivatives[key], self.func_modules) for key in self._missing_log_pdf_derivatives.keys()}
-            self._missing_second_derivative_code = {key: lambdify(self.arg_list, self._missing_log_pdf_second_derivatives[key], self.func_modules) for key in self._missing_log_pdf_second_derivatives.keys()}
-            self._missing_third_derivative_code = {key: lambdify(self.arg_list, self._missing_log_pdf_third_derivatives[key], self.func_modules) for key in self._missing_log_pdf_third_derivatives.keys()}
+            symbolic_functions += [self._sym_missing_log_pdf]
+            deriv_list = [self._missing_log_pdf_derivatives, self._missing_log_pdf_second_derivatives, self._missing_log_pdf_third_derivatives]
+            symbolic_functions += [deriv[key] for deriv in deriv_list for key in sorted(deriv.keys())]
+        # self._log_pdf_function = lambdify(self.arg_list, self._sym_log_pdf, self.func_modules)
+
+        # # compute code for derivatives
+        # self._derivative_code = {key: lambdify(self.arg_list, self._log_pdf_derivatives[key], self.func_modules) for key in self._log_pdf_derivatives.keys()}
+        # self._second_derivative_code = {key: lambdify(self.arg_list, self._log_pdf_second_derivatives[key], self.func_modules) for key in self._log_pdf_second_derivatives.keys()}
+        # self._third_derivative_code = {key: lambdify(self.arg_list, self._log_pdf_third_derivatives[key], self.func_modules) for key in self._log_pdf_third_derivatives.keys()}
+
+        # if self.missing_data:
+        #     self._missing_derivative_code = {key: lambdify(self.arg_list, self._missing_log_pdf_derivatives[key], self.func_modules) for key in self._missing_log_pdf_derivatives.keys()}
+        #     self._missing_second_derivative_code = {key: lambdify(self.arg_list, self._missing_log_pdf_second_derivatives[key], self.func_modules) for key in self._missing_log_pdf_second_derivatives.keys()}
+        #     self._missing_third_derivative_code = {key: lambdify(self.arg_list, self._missing_log_pdf_third_derivatives[key], self.func_modules) for key in self._missing_log_pdf_third_derivatives.keys()}
 
         # TODO: compute EP code parts based on logZ. We need dlogZ/dmu, d2logZ/dmu2 and dlogZ/dtheta
diff --git a/GPy/util/functions.py b/GPy/util/functions.py
index 1cb1f7cb..3278182f 100644
--- a/GPy/util/functions.py
+++ b/GPy/util/functions.py
@@ -19,3 +19,7 @@ def normcdfln(x):
 
 def clip_exp(x):
     return np.where(x<lim_val, np.where(x>-lim_val, np.exp(x), epsilon), 1/epsilon)
+
+def differfln(x0, x1):
+    # this is, hopefully, a numerically more stable variant of log(erf(x0)-erf(x1)) = log(erfc(x1)-erfc(x0)).
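+    # Since erfc(x) = exp(-x**2)*erfcx(x), factoring the larger exponential
+    # out of erfc(x1) - erfc(x0) gives
+    #   erfc(x1) - erfc(x0) = exp(-x1**2)*(erfcx(x1) - exp(x1**2-x0**2)*erfcx(x0)),
+    # whose log is the first branch below; the second branch factors out
+    # exp(-x0**2) instead, and np.where picks between them on the sign of x0-x1.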
+ return np.where(x0>x1, -x1*x1 + np.log(erfcx(x1)-np.exp(-x0**2+x1**2)*erfcx(x0)), -x0*x0 + np.log(np.exp(-x1**2+x0**2)*erfcx(x1) - erfcx(x0))) diff --git a/GPy/util/symbolic.py b/GPy/util/symbolic.py index 31aa1375..2cbff28f 100644 --- a/GPy/util/symbolic.py +++ b/GPy/util/symbolic.py @@ -1,10 +1,52 @@ import sympy as sym -from sympy import Function, S, oo, I, cos, sin, asin, log, erf, pi, exp, sqrt, sign, gamma,polygamma - +from sympy import Function, S, oo, I, cos, sin, asin, log, erf, pi, exp, sqrt, sign, gamma, polygamma +from sympy.utilities.lambdify import lambdastr +from sympy.utilities.iterables import numbered_symbols def stabilise(e): """Attempt to find the most numerically stable form of an expression""" return e #sym.expand(e) + +def gen_code(expressions, prefix='sub'): + """Generate code for the list of expressions provided using the common sub-expression eliminator.""" + # First convert the expressions to a list. + # We should find the following type of expressions: 'function', 'derivative', 'second_derivative', 'third_derivative'. + + expression_list = [] + expression_keys = [] + function_code = {} + for key in expressions.key(): + if key == 'function': + expression_list.append(expressions[key]) + expression_keys.append(key) + function_code[key] = '' + elif key[-9:] == 'derivative': + function_code[key] = {} + for dkey in expressions[key]: + expression_list.append(expressions[key][dkey]) + expression_keys.append([key, dkey]) + function_code[key][dkey] = '' + + symbols, functions = sym.cse(expression_list, numbered_symbols(prefix=prefix)) + + # Create strings that lambda strings from the expressions. + sub_expressions = [] + for sub, expr in symbols: + arg_list = [e for e in expr.atoms() if e.is_Symbol] + sub_code += [lambdastr(sorted(arg_list), expr)] + + function_expressions = [] + for expr, keys in zip(functions, expression_keys): + arg_list = [e for e in expr.atoms() if e.is_Symbol] + function_code += [lambdastr(sorted(arg_list), expr)] + + for key in function_code.key(): + if key == 'function': + function_code[key] = + + return sub_code, func_code + + class logistic(Function): """The logistic function as a symbolic function.""" nargs = 1 @@ -123,23 +165,23 @@ class normal(Function): def _eval_is_real(self): return True -class ln_diff_erf(Function): +class differfln(Function): nargs = 2 def fdiff(self, argindex=2): if argindex == 2: x0, x1 = self.args - return -2*exp(-x1**2)/(sqrt(pi)*(erf(x0)-erf(x1))) + return -2/(sqrt(pi)*(erfcx(x1)-exp(x1**2-x0**2)*erfcx(x0))) elif argindex == 1: x0, x1 = self.args - return 2.*exp(-x0**2)/(sqrt(pi)*(erf(x0)-erf(x1))) + return 2/(sqrt(pi)*(exp(x0**2-x1**2)*erfcx(x1)-erfcx(x0))) else: raise ArgumentIndexError(self, argindex) @classmethod def eval(cls, x0, x1): if x0.is_Number and x1.is_Number: - return log(erf(x0)-erf(x1)) + return log(erfc(x1)-erfc(x0)) class dh_dd_i(Function): nargs = 5 @@ -304,7 +346,6 @@ class h(Function): @classmethod def eval(cls, t, tprime, d_i, d_j, l): - # putting in the is_Number stuff forces it to look for a fdiff method for derivative. If it's left out, then when asking for self.diff, it just does the diff on the eval symbolic terms directly. We want to avoid that because we are looking to ensure everything is numerically stable. Maybe it's because of the if statement that this happens? 
        if (t.is_Number and tprime.is_Number and d_i.is_Number

From 9277695a6d97c88ec41f53d8590b2745109e6c8b Mon Sep 17 00:00:00 2001
From: Neil Lawrence
Date: Fri, 11 Apr 2014 16:35:41 +0100
Subject: [PATCH 3/7] More changes to symbolic

---
 GPy/core/mapping.py                |  12 +++-
 GPy/core/parameterization/param.py |   2 +-
 GPy/util/symbolic.py               | 103 +++++++++++++++++++++--------
 3 files changed, 87 insertions(+), 30 deletions(-)

diff --git a/GPy/core/mapping.py b/GPy/core/mapping.py
index efd9476f..8cfd9ad0 100644
--- a/GPy/core/mapping.py
+++ b/GPy/core/mapping.py
@@ -34,7 +34,7 @@ class Mapping(Parameterized):
         raise NotImplementedError
 
     def df_dtheta(self, dL_df, X):
-        """The gradient of the outputs of the multi-layer perceptron with respect to each of the parameters.
+        """The gradient of the outputs of the mapping with respect to each of the parameters.
 
         :param dL_df: gradient of the objective with respect to the function.
         :type dL_df: ndarray (num_data x output_dim)
@@ -50,7 +50,7 @@
         """
         Plots the mapping associated with the model.
           - In one dimension, the function is plotted.
-          - In two dimsensions, a contour-plot shows the function
+          - In two dimensions, a contour-plot shows the function
           - In higher dimensions, we've not implemented this yet !TODO!
 
         Can plot only part of the data and part of the posterior functions
@@ -65,6 +65,14 @@
         else:
             raise NameError, "matplotlib package has not been imported."
 
+class Bijective_mapping(Mapping):
+    """This is a mapping that is bijective, i.e. you can go from X to f and also back from f to X. The inverse mapping is called g()."""
+    def __init__(self, input_dim, output_dim, name='bijective_mapping'):
+        super(Bijective_mapping, self).__init__(input_dim, output_dim, name=name)
+
+    def g(self, f):
+        """Inverse mapping from output domain of the function to the inputs."""
+        raise NotImplementedError
 
 from model import Model
diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py
index 182af902..60bdfe9d 100644
--- a/GPy/core/parameterization/param.py
+++ b/GPy/core/parameterization/param.py
@@ -4,7 +4,7 @@ import itertools
 import numpy
 
 from parameter_core import OptimizationHandlable, adjust_name_for_printing
-from array_core import ObsAr
+from observable_array import ObsAr
 
 ###### printing
 __constraints_name__ = "Constraint"
diff --git a/GPy/util/symbolic.py b/GPy/util/symbolic.py
index 2cbff28f..5644fb76 100644
--- a/GPy/util/symbolic.py
+++ b/GPy/util/symbolic.py
@@ -7,45 +7,94 @@ def stabilise(e):
     return e #sym.expand(e)
 
-def gen_code(expressions, prefix='sub'):
-    """Generate code for the list of expressions provided using the common sub-expression eliminator."""
+def gen_code(expressions, cache_prefix = 'cache', sub_prefix = 'sub', prefix='XoXoXoX', cacheable=[]):
+    """Generate code for the list of expressions provided using the common sub-expression eliminator to separate out portions that are computed multiple times."""
     # First convert the expressions to a list.
     # We should find the following type of expressions: 'function', 'derivative', 'second_derivative', 'third_derivative'.
 
+    # Helper functions to get data in and out of dictionaries.
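+    # (e.g. getFromDict(code, ['main', 'derivative', 'f']) returns
+    # code['main']['derivative']['f'], and setInDict assigns along the
+    # same key path.)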
+    # this code from http://stackoverflow.com/questions/14692690/access-python-nested-dictionary-items-via-a-list-of-keys
+    def getFromDict(dataDict, mapList):
+        return reduce(lambda d, k: d[k], mapList, dataDict)
+    def setInDict(dataDict, mapList, value):
+        getFromDict(dataDict, mapList[:-1])[mapList[-1]] = value
+
+    # This is the return dictionary that stores all the generated code.
+    code = {}
     expression_list = []
-    expression_keys = []
-    function_code = {}
-    for key in expressions.key():
+    key_list = []
+    order_list = []
+    code['main'] = {}
+    for key in expressions.keys():
         if key == 'function':
-            expression_list.append(expressions[key])
-            expression_keys.append(key)
-            function_code[key] = ''
-        elif key[-9:] == 'derivative':
-            function_code[key] = {}
-            for dkey in expressions[key]:
+            expression_list.append(expressions[key])
+            key_list.append([key])
+            order_list.append(1)
+            code['main'][key] = ''
+        elif key[-10:] == 'derivative':
+            code['main'][key] = {}
+            for dkey in expressions[key].keys():
                 expression_list.append(expressions[key][dkey])
-                expression_keys.append([key, dkey])
-                function_code[key][dkey] = ''
+                key_list.append([key, dkey])
+                if key[:-11] == 'first' or key[:-11] == '':
+                    order_list.append(3) #sym.count_ops(expressions[key][dkey]))
+                elif key[:-11] == 'second':
+                    order_list.append(4) #sym.count_ops(expressions[key][dkey]))
+                elif key[:-11] == 'third':
+                    order_list.append(5) #sym.count_ops(expressions[key][dkey]))
+                code['main'][key][dkey] = ''
+        else:
+            expression_list.append(expressions[key])
+            key_list.append([key])
+            order_list.append(2)
+            code['main'][key] = ''
 
-    symbols, functions = sym.cse(expression_list, numbered_symbols(prefix=prefix))
-
+    # This step may be unnecessary.
+    # Not 100% sure if the sub expression elimination is order sensitive. This step orders the list with the 'function' code first and derivatives after.
+    order_list, expression_list, key_list = zip(*sorted(zip(order_list, expression_list, key_list)))
+
+    print expression_list
+
+    subexpressions, expression_substituted_list = sym.cse(expression_list, numbered_symbols(prefix=prefix))
+    cacheable_list = []
     # Create strings that lambda strings from the expressions.
-    sub_expressions = []
-    for sub, expr in symbols:
+    code['params_change'] = []
+    code['cache'] = []
+    code['subexpressions'] = []
+    code['cacheexpressions'] = []
+    for expr in subexpressions:
+        arg_list = [e for e in expr[1].atoms() if e.is_Symbol]
+        cacheable_symbols = [e for e in arg_list if e in cacheable_list or e in cacheable]
+        if cacheable_symbols:
+            code['cache'].append((expr[0],expr2code(arg_list, expr[1])))
+            # list which ensures dependencies are cacheable.
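+            # (once a sub-expression is cached, its symbol joins
+            # cacheable_list, so any later sub-expression that uses it is
+            # routed into the cache branch as well.)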
+ cacheable_list.append(expr[0]) + code['cacheexpressions'].append(expr[0]) + else: + code['params_change'].append((expr[0],expr2code(arg_list, expr[1]))) + code['subexpressions'].append(expr[0]) + + for expr, keys in zip(expression_substituted_list, key_list): arg_list = [e for e in expr.atoms() if e.is_Symbol] - sub_code += [lambdastr(sorted(arg_list), expr)] + setInDict(code['main'], keys, expr2code(arg_list, expr)) + setInDict(expressions, keys, expr) - function_expressions = [] - for expr, keys in zip(functions, expression_keys): - arg_list = [e for e in expr.atoms() if e.is_Symbol] - function_code += [lambdastr(sorted(arg_list), expr)] + sub_dict = {} + for i, sub in enumerate(code['cacheexpressions']): + sub_dict[sub.name] = cache_prefix + str(i) + for i, sub in enumerate(code['subexpressions']): + sub_dict[sub.name] = sub_prefix + str(i) - for key in function_code.key(): - if key == 'function': - function_code[key] = - - return sub_code, func_code + + return code + +def expr2code(arg_list, expr): + """Convert the given symbolic expression into code.""" + code = lambdastr(arg_list, expr) + function_code = code.split(':')[1] + for arg in arg_list: + function_code = function_code.replace(arg.name, 'self.'+arg.name) + + return function_code class logistic(Function): """The logistic function as a symbolic function.""" From c17791a12c3263b8b3d4297036cb840cc1c823d3 Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Wed, 16 Apr 2014 15:37:50 +0100 Subject: [PATCH 4/7] [datasets] merged hapmap dataset into params --- .../latent_function_inference/var_dtc.py | 11 +- GPy/util/data_resources.json | 410 +----------------- GPy/util/datasets.py | 170 +++++++- GPy/util/datasets/data_resources_create.py | 59 ++- 4 files changed, 227 insertions(+), 423 deletions(-) diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py index 7344b204..0cc841ed 100644 --- a/GPy/inference/latent_function_inference/var_dtc.py +++ b/GPy/inference/latent_function_inference/var_dtc.py @@ -192,17 +192,22 @@ class VarDTC(object): class VarDTCMissingData(object): const_jitter = 1e-6 - def __init__(self, limit=1): + def __init__(self, limit=1, inan=None): from ...util.caching import Cacher self._Y = Cacher(self._subarray_computations, limit) + self._inan = inan pass def set_limit(self, limit): self._Y.limit = limit def _subarray_computations(self, Y): - inan = np.isnan(Y) - has_none = inan.any() + if self._inan is None: + inan = np.isnan(Y) + has_none = inan.any() + else: + inan = self._inan + has_none = True if has_none: from ...util.subarray_and_sorting import common_subarrays self._subarray_indices = [] diff --git a/GPy/util/data_resources.json b/GPy/util/data_resources.json index 57b79f10..845d56be 100644 --- a/GPy/util/data_resources.json +++ b/GPy/util/data_resources.json @@ -1,409 +1 @@ -{ - "rogers_girolami_data":{ - "files":[ - [ - "firstcoursemldata.tar.gz" - ] - ], - "license":null, - "citation":"A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146", - "details":"Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.", - "urls":[ - "https://www.dropbox.com/sh/7p6tu1t29idgliq/_XqlH_3nt9/" - ], - "suffices":[ - [ - "?dl=1" - ] - ], - "size":21949154 - }, - "ankur_pose_data":{ - "files":[ - [ - "ankurDataPoseSilhouette.mat" - ] - ], - "citation":"3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. 
Agarwal and B. Triggs.", - "license":null, - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/ankur_pose_data/" - ], - "details":"Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing.", - "size":1 - }, - "football_data":{ - "files":[ - [ - "E0.csv", "E1.csv", "E2.csv", "E3.csv" - ] - ], - "citation":"", - "license":null, - "urls":[ - "http://www.football-data.co.uk/mmz4281/" - ], - "details":"Results of English football matches since 1993/94 season.", - "size":1 - }, - "google_trends":{ - "files":[ - [ - ] - ], - "citation":"", - "license":null, - "urls":[ - "http://www.google.com/trends/" - ], - "details":"Google trends results.", - "size":0 - }, - "osu_accad":{ - "files":[ - [ - "swagger1TXT.ZIP", - "handspring1TXT.ZIP", - "quickwalkTXT.ZIP", - "run1TXT.ZIP", - "sprintTXT.ZIP", - "dogwalkTXT.ZIP", - "camper_04TXT.ZIP", - "dance_KB3_TXT.ZIP", - "per20_TXT.ZIP", - "perTWO07_TXT.ZIP", - "perTWO13_TXT.ZIP", - "perTWO14_TXT.ZIP", - "perTWO15_TXT.ZIP", - "perTWO16_TXT.ZIP" - ], - [ - "connections.txt" - ] - ], - "license":"Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).", - "citation":"The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.", - "details":"Motion capture data of different motions from the Open Motion Data Project at Ohio State University.", - "urls":[ - "http://accad.osu.edu/research/mocap/data/", - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/" - ], - "size":15922790 - }, - "isomap_face_data":{ - "files":[ - [ - "face_data.mat" - ] - ], - "license":null, - "citation":"A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", - "details":"Face data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/isomap_face_data/" - ], - "size":24229368 - }, - "boston_housing":{ - "files":[ - [ - "Index", - "housing.data", - "housing.names" - ] - ], - "license":null, - "citation":"Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.", - "details":"The Boston Housing data relates house values in Boston to a range of input variables.", - "urls":[ - "http://archive.ics.uci.edu/ml/machine-learning-databases/housing/" - ], - "size":51276 - }, - "cmu_mocap_full":{ - "files":[ - [ - "allasfamc.zip" - ] - ], - "license":"From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. 
The database was created with funding from NSF EIA-0196217.", - "citation":"Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.\nThe database was created with funding from NSF EIA-0196217.", - "details":"CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.", - "urls":[ - "http://mocap.cs.cmu.edu/subjects" - ], - "size":null - }, - "brendan_faces":{ - "files":[ - [ - "frey_rawface.mat" - ] - ], - "license":null, - "citation":"Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.", - "details":"A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.", - "urls":[ - "http://www.cs.nyu.edu/~roweis/data/" - ], - "size":1100584 - }, - "olympic_marathon_men":{ - "files":[ - [ - "olympicMarathonTimes.csv" - ] - ], - "license":null, - "citation":null, - "details":"Olympic mens' marathon gold medal winning times from 1896 to 2012. Time given in pace (minutes per kilometer). Data is originally downloaded and collated from Wikipedia, we are not responsible for errors in the data", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olympic_marathon_men/" - ], - "size":584 - }, - "pumadyn-32nm":{ - "files":[ - [ - "pumadyn-32nm.tar.gz" - ] - ], - "license":"Data is made available by the Delve system at the University of Toronto", - "citation":"Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.", - "details":"Pumadyn non linear 32 input data set with moderate noise. See http://www.cs.utoronto.ca/~delve/data/pumadyn/desc.html for details.", - "urls":[ - "ftp://ftp.cs.toronto.edu/pub/neuron/delve/data/tarfiles/pumadyn-family/" - ], - "size":5861646 - }, - "ripley_prnn_data":{ - "files":[ - [ - "Cushings.dat", - "README", - "crabs.dat", - "fglass.dat", - "fglass.grp", - "pima.te", - "pima.tr", - "pima.tr2", - "synth.te", - "synth.tr", - "viruses.dat", - "virus3.dat" - ] - ], - "license":null, - "citation":"Pattern Recognition and Neural Networks by B.D. Ripley (1996) Cambridge University Press ISBN 0 521 46986 7", - "details":"Data sets from Brian Ripley's Pattern Recognition and Neural Networks", - "urls":[ - "http://www.stats.ox.ac.uk/pub/PRNN/" - ], - "size":93565 - }, - "three_phase_oil_flow":{ - "files":[ - [ - "DataTrnLbls.txt", - "DataTrn.txt", - "DataTst.txt", - "DataTstLbls.txt", - "DataVdn.txt", - "DataVdnLbls.txt" - ] - ], - "license":null, - "citation":"Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. 
Nuclear Instruments and Methods in Physics Research A327, 580-593", - "details":"The three phase oil data used initially for demonstrating the Generative Topographic mapping.", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/three_phase_oil_flow/" - ], - "size":712796 - }, - "robot_wireless":{ - "files":[ - [ - "uw-floor.txt" - ] - ], - "license":null, - "citation":"WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.", - "details":"Data created by Brian Ferris and Dieter Fox. Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/robot_wireless/" - ], - "size":284390 - }, - "xw_pen":{ - "files":[ - [ - "xw_pen_15.csv" - ] - ], - "license":null, - "citation":"Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005", - "details":"Accelerometer pen data used for robust regression by Tipping and Lawrence.", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/xw_pen/" - ], - "size":3410 - }, - "swiss_roll":{ - "files":[ - [ - "swiss_roll_data.mat" - ] - ], - "license":null, - "citation":"A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", - "details":"Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", - "urls":[ - "http://isomap.stanford.edu/" - ], - "size":800256 - }, - "osu_run1":{ - "files":[ - [ - "run1TXT.ZIP" - ], - [ - "connections.txt" - ] - ], - "license":"Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).", - "citation":"The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.", - "details":"Motion capture data of a stick man running from the Open Motion Data Project at Ohio State University.", - "urls":[ - "http://accad.osu.edu/research/mocap/data/", - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/" - ], - "size":338103 - }, - "creep_rupture":{ - "files":[ - [ - "creeprupt.tar" - ] - ], - "license":null, - "citation":"Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.", - "details":"Provides 2066 creep rupture test results of steels (mainly of two kinds of steels: 2.25Cr and 9-12 wt% Cr ferritic steels). See http://www.msm.cam.ac.uk/map/data/materials/creeprupt-b.html.", - "urls":[ - "http://www.msm.cam.ac.uk/map/data/tar/" - ], - "size":602797 - }, - "olivetti_faces":{ - "files":[ - [ - "att_faces.zip" - ], - [ - "olivettifaces.mat" - ] - ], - "license":null, - "citation":"Ferdinando Samaria and Andy Harter, Parameterisation of a Stochastic Model for Human Face Identification. 
Proceedings of 2nd IEEE Workshop on Applications of Computer Vision, Sarasota FL, December 1994", - "details":"Olivetti Research Labs Face data base, acquired between December 1992 and December 1994 in the Olivetti Research Lab, Cambridge (which later became AT&T Laboratories, Cambridge). When using these images please give credit to AT&T Laboratories, Cambridge. ", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olivetti_faces/", - "http://www.cs.nyu.edu/~roweis/data/" - ], - "size":8561331 - }, - "olivetti_glasses":{ - "files":[ - [ - "has_glasses.np" - ], - [ - "olivettifaces.mat" - ] - ], - "license":null, - "citation":"Information recorded in olivetti_faces entry. Should be used from there.", - "details":"Information recorded in olivetti_faces entry. Should be used from there.", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olivetti_faces/", - "http://www.cs.nyu.edu/~roweis/data/" - ], - "size":4261047 - }, - "della_gatta":{ - "files":[ - [ - "DellaGattadata.mat" - ] - ], - "license":null, - "citation":"Direct targets of the TRP63 transcription factor revealed by a combination of gene expression profiling and reverse engineering. Giusy Della Gatta, Mukesh Bansal, Alberto Ambesi-Impiombato, Dario Antonini, Caterina Missero, and Diego di Bernardo, Genome Research 2008", - "details":"The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA.", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/della_gatta/" - ], - "size":3729650 - }, - "epomeo_gpx":{ - "files":[ - [ - "endomondo_1.gpx", - "endomondo_2.gpx", - "garmin_watch_via_endomondo.gpx", - "viewranger_phone.gpx", - "viewranger_tablet.gpx" - ] - ], - "license":null, - "citation":"", - "details":"Five different GPS traces of the same run up Mount Epomeo in Ischia. The traces are from different sources. endomondo_1 and endomondo_2 are traces from the mobile phone app Endomondo, with a split in the middle. garmin_watch_via_endomondo is the trace from a Garmin watch, with a segment missing about 4 kilometers in. viewranger_phone and viewranger_tablet are traces from a phone and a tablet through the viewranger app. The viewranger_phone data comes from the same mobile phone as the Endomondo data (i.e. there are 3 GPS devices, but one device recorded two traces).", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/epomeo_gpx/" - ], - "size":2031872 - }, - "mauna_loa":{ - "files":[ - [ - "co2_mm_mlo.txt" - ] - ], - "license":"-------------------------------------------------------------------- USE OF NOAA ESRL DATA\n\n These data are made freely available to the public and the scientific community in the belief that their wide dissemination will lead to greater understanding and new scientific insights. The availability of these data does not constitute publication of the data. NOAA relies on the ethics and integrity of the user to insure that ESRL receives fair credit for their work. If the data are obtained for potential use in a publication or presentation, ESRL should be informed at the outset of the nature of this work. If the ESRL data are essential to the work, or if an important result or conclusion depends on the ESRL data, co-authorship may be appropriate. This should be discussed at an early stage in the work. 
Manuscripts using the ESRL data should be sent to ESRL for review before they are submitted for publication so we can insure that the quality and limitations of the data are accurately represented.\n\n Contact: Pieter Tans (303 497 6678; pieter.tans@noaa.gov)\n\n RECIPROCITY Use of these data implies an agreement to reciprocate. Laboratories making similar measurements agree to make their own data available to the general public and to the scientific community in an equally complete and easily accessible form. Modelers are encouraged to make available to the community, upon request, their own tools used in the interpretation of the ESRL data, namely well documented model code, transport fields, and additional information necessary for other scientists to repeat the work and to run modified versions. Model availability includes collaborative support for new users of the models.\n --------------------------------------------------------------------\n\n See www.esrl.noaa.gov/gmd/ccgg/trends/ for additional details.", - "citation":"Mauna Loa Data. Dr. Pieter Tans, NOAA/ESRL (www.esrl.noaa.gov/gmd/ccgg/trends/) and Dr. Ralph Keeling, Scripps Institution of Oceanography (scrippsco2.ucsd.edu/).", - "details":"The 'average' column contains the monthly mean CO2 mole fraction determined from daily averages. The mole fraction of CO2, expressed as parts per million (ppm) is the number of molecules of CO2 in every one million molecules of dried air (water vapor removed). If there are missing days concentrated either early or late in the month, the monthly mean is corrected to the middle of the month using the average seasonal cycle. Missing months are denoted by -99.99. The 'interpolated' column includes average values from the preceding column and interpolated values where data are missing. Interpolated values are computed in two steps. First, we compute for each month the average seasonal cycle in a 7-year window around each monthly value. In this way the seasonal cycle is allowed to change slowly over time. We then determine the 'trend' value for each month by removing the seasonal cycle; this result is shown in the 'trend' column. Trend values are linearly interpolated for missing months. The interpolated monthly mean is then the sum of the average seasonal cycle value and the trend value for the missing month.\n\nNOTE: In general, the data presented for the last year are subject to change, depending on recalibration of the reference gas mixtures used, and other quality control procedures. Occasionally, earlier years may also be changed for the same reasons. Usually these changes are minor.\n\nCO2 expressed as a mole fraction in dry air, micromol/mol, abbreviated as ppm \n\n (-99.99 missing data; -1 no data for daily means in month)", - "urls":[ - "ftp://aftp.cmdl.noaa.gov/products/trends/co2/" - ], - "size":46779 - }, - "boxjenkins_airline":{ - "files":[ - [ - "boxjenkins_airline.csv" - ] - ], - "license":"You may copy and redistribute the data. You may make derivative works from the data. You may use the data for commercial purposes. You may not sublicence the data when redistributing it. You may not redistribute the data under a different license. Source attribution on any use of this data: Must refer source.", - "citation":"Box & Jenkins (1976), in file: data/airpass, Description: International airline passengers: monthly totals in thousands. 
Jan 49 – Dec 60", - "details":"International airline passengers, monthly totals from January 1949 to December 1960.", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/boxjenkins_airline/" - ], - "size":46779 - }, - - "decampos_characters":{ - "files":[ - [ - "characters.npy", - "digits.npy" - ] - ], - "license":null, - "citation":"T. de Campos, B. R. Babu, and M. Varma. Character recognition in natural images. VISAPP 2009.", - "details":"Examples of hand written digits taken from the de Campos et al paper on Character Recognition in Natural Images.", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/decampos_digits/" - ], - "size":2031872 - } -} +{"rogers_girolami_data": {"files": [["firstcoursemldata.tar.gz"]], "license": null, "citation": "A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146", "details": "Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.", "urls": ["https://www.dropbox.com/sh/7p6tu1t29idgliq/_XqlH_3nt9/"], "suffices": [["?dl=1"]], "size": 21949154}, "ankur_pose_data": {"files": [["ankurDataPoseSilhouette.mat"]], "citation": "3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.", "license": null, "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/ankur_pose_data/"], "details": "Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing."}, "osu_accad": {"files": [["swagger1TXT.ZIP", "handspring1TXT.ZIP", "quickwalkTXT.ZIP", "run1TXT.ZIP", "sprintTXT.ZIP", "dogwalkTXT.ZIP", "camper_04TXT.ZIP", "dance_KB3_TXT.ZIP", "per20_TXT.ZIP", "perTWO07_TXT.ZIP", "perTWO13_TXT.ZIP", "perTWO14_TXT.ZIP", "perTWO15_TXT.ZIP", "perTWO16_TXT.ZIP"], ["connections.txt"]], "license": "Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).", "citation": "The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.", "details": "Motion capture data of different motions from the Open Motion Data Project at Ohio State University.", "urls": ["http://accad.osu.edu/research/mocap/data/", "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/"], "size": 15922790}, "isomap_face_data": {"files": [["face_data.mat"]], "license": null, "citation": "A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", "details": "Face data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/isomap_face_data/"], "size": 24229368}, "boston_housing": {"files": [["Index", "housing.data", "housing.names"]], "license": null, "citation": "Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. 
Economics & Management, vol.5, 81-102, 1978.", "details": "The Boston Housing data relates house values in Boston to a range of input variables.", "urls": ["http://archive.ics.uci.edu/ml/machine-learning-databases/housing/"], "size": 51276}, "cmu_mocap_full": {"files": [["allasfamc.zip"]], "license": "From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.", "citation": "Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.'\n 'The database was created with funding from NSF EIA-0196217.", "details": "CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.", "urls": ["http://mocap.cs.cmu.edu"], "size": null}, "brendan_faces": {"files": [["frey_rawface.mat"]], "license": null, "citation": "Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.", "details": "A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.", "urls": ["http://www.cs.nyu.edu/~roweis/data/"], "size": 1100584}, "olympic_marathon_men": {"files": [["olympicMarathonTimes.csv"]], "license": null, "citation": null, "details": "Olympic mens' marathon gold medal winning times from 1896 to 2012. Time given in pace (minutes per kilometer). Data is originally downloaded and collated from Wikipedia, we are not responsible for errors in the data", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olympic_marathon_men/"], "size": 584}, "pumadyn-32nm": {"files": [["pumadyn-32nm.tar.gz"]], "license": "Data is made available by the Delve system at the University of Toronto", "citation": "Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.", "details": "Pumadyn non linear 32 input data set with moderate noise. See http://www.cs.utoronto.ca/~delve/data/pumadyn/desc.html for details.", "urls": ["ftp://ftp.cs.toronto.edu/pub/neuron/delve/data/tarfiles/pumadyn-family/"], "size": 5861646}, "ripley_prnn_data": {"files": [["Cushings.dat", "README", "crabs.dat", "fglass.dat", "fglass.grp", "pima.te", "pima.tr", "pima.tr2", "synth.te", "synth.tr", "viruses.dat", "virus3.dat"]], "license": null, "citation": "Pattern Recognition and Neural Networks by B.D. 
Ripley (1996) Cambridge University Press ISBN 0 521 46986 7", "details": "Data sets from Brian Ripley's Pattern Recognition and Neural Networks", "urls": ["http://www.stats.ox.ac.uk/pub/PRNN/"], "size": 93565}, "three_phase_oil_flow": {"files": [["DataTrnLbls.txt", "DataTrn.txt", "DataTst.txt", "DataTstLbls.txt", "DataVdn.txt", "DataVdnLbls.txt"]], "license": null, "citation": "Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. Nuclear Instruments and Methods in Physics Research A327, 580-593", "details": "The three phase oil data used initially for demonstrating the Generative Topographic mapping.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/three_phase_oil_flow/"], "size": 712796}, "robot_wireless": {"files": [["uw-floor.txt"]], "license": null, "citation": "WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.", "details": "Data created by Brian Ferris and Dieter Fox. Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/robot_wireless/"], "size": 284390}, "xw_pen": {"files": [["xw_pen_15.csv"]], "license": null, "citation": "Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005", "details": "Accelerometer pen data used for robust regression by Tipping and Lawrence.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/xw_pen/"], "size": 3410}, "swiss_roll": {"files": [["swiss_roll_data.mat"]], "license": null, "citation": "A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", "details": "Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", "urls": ["http://isomap.stanford.edu/"], "size": 800256}, "osu_run1": {"files": [["run1TXT.ZIP"], ["connections.txt"]], "license": "Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).", "citation": "The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.", "details": "Motion capture data of a stick man running from the Open Motion Data Project at Ohio State University.", "urls": ["http://accad.osu.edu/research/mocap/data/", "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/"], "size": 338103}, "creep_rupture": {"files": [["creeprupt.tar"]], "license": null, "citation": "Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.", "details": "Provides 2066 creep rupture test results of steels (mainly of two kinds of steels: 2.25Cr and 9-12 wt% Cr ferritic steels). 
See http://www.msm.cam.ac.uk/map/data/materials/creeprupt-b.html.", "urls": ["http://www.msm.cam.ac.uk/map/data/tar/"], "size": 602797}, "hapmap3": {"files": [["hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2", "hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2", "relationships_w_pops_121708.txt"]], "license": "International HapMap Project Public Access License (http://hapmap.ncbi.nlm.nih.gov/cgi-perl/registration#licence)", "citation": "Gibbs, Richard A., et al. \"The international HapMap project.\" Nature 426.6968 (2003): 789-796.", "details": "HapMap Project: Single Nucleotide Polymorphism sequenced in all human populations. See http://www.nature.com/nature/journal/v426/n6968/abs/nature02168.html for details.", "urls": ["http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/latest_phaseIII_ncbi_b36/plink_format/"], "size": 3458246739}, "olivetti_faces": {"files": [["att_faces.zip"], ["olivettifaces.mat"]], "license": null, "citation": "Ferdinando Samaria and Andy Harter, Parameterisation of a Stochastic Model for Human Face Identification. Proceedings of 2nd IEEE Workshop on Applications of Computer Vision, Sarasota FL, December 1994", "details": "Olivetti Research Labs Face data base, acquired between December 1992 and December 1994 in the Olivetti Research Lab, Cambridge (which later became AT&T Laboratories, Cambridge). When using these images please give credit to AT&T Laboratories, Cambridge. ", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olivetti_faces/", "http://www.cs.nyu.edu/~roweis/data/"], "size": 8561331}, "della_gatta": {"files": [["DellaGattadata.mat"]], "license": null, "citation": "Direct targets of the TRP63 transcription factor revealed by a combination of gene expression profiling and reverse engineering. Giusy Della Gatta, Mukesh Bansal, Alberto Ambesi-Impiombato, Dario Antonini, Caterina Missero, and Diego di Bernardo, Genome Research 2008", "details": "The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/della_gatta/"], "size": 3729650}, "epomeo_gpx": {"files": [["endomondo_1.gpx", "endomondo_2.gpx", "garmin_watch_via_endomondo.gpx", "viewranger_phone.gpx", "viewranger_tablet.gpx"]], "license": null, "citation": "", "details": "Five different GPS traces of the same run up Mount Epomeo in Ischia. The traces are from different sources. endomondo_1 and endomondo_2 are traces from the mobile phone app Endomondo, with a split in the middle. garmin_watch_via_endomondo is the trace from a Garmin watch, with a segment missing about 4 kilometers in. viewranger_phone and viewranger_tablet are traces from a phone and a tablet through the viewranger app. The viewranger_phone data comes from the same mobile phone as the Endomondo data (i.e. 
there are 3 GPS devices, but one device recorded two traces).", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/epomeo_gpx/"], "size": 2031872}} \ No newline at end of file diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py index f53163f4..748a5839 100644 --- a/GPy/util/datasets.py +++ b/GPy/util/datasets.py @@ -106,9 +106,30 @@ def download_url(url, store_directory, save_name = None, messages = True, suffix raise ValueError('Tried url ' + url + suffix + ' and received client error ' + str(response.code)) elif response.code > 499: raise ValueError('Tried url ' + url + suffix + ' and received server error ' + str(response.code)) - # if we wanted to get more sophisticated maybe we should check the response code here again even for successes. with open(save_name, 'wb') as f: - f.write(response.read()) + meta = response.info() + file_size = int(meta.getheaders("Content-Length")[0]) + status = "" + file_size_dl = 0 + block_sz = 8192 + line_length=30 + while True: + buff = response.read(block_sz) + if not buff: + break + file_size_dl += len(buff) + f.write(buff) + sys.stdout.write(" "*(len(status)) + "\r") + status = r"[{perc: <{ll}}] {dl:7.3f}/{full:.3f}MB".format(dl=file_size_dl/(1.*1e6), + full=file_size/(1.*1e6), ll=line_length, + perc="="*int(line_length*float(file_size_dl)/file_size)) + sys.stdout.write(status) + sys.stdout.flush() + sys.stdout.write(" "*(len(status)) + "\r") + print status + # if we wanted to get more sophisticated maybe we should check the response code here again even for successes. + #with open(save_name, 'wb') as f: + # f.write(response.read()) #urllib.urlretrieve(url+suffix, save_name, reporthook) @@ -552,6 +573,151 @@ def swiss_roll_generated(num_samples=1000, sigma=0.0): c = c[so, :] return {'Y':Y, 't':t, 'colors':c} +def hapmap3(data_set='hapmap3'): + """ + The HapMap phase three SNP dataset - 1184 samples out of 11 populations. + + SNP_matrix (A) encoding [see Paschou et all. 2007 (PCA-Correlated SNPs...)]: + Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then + + / 1, iff SNPij==(B1,B1) + Aij = | 0, iff SNPij==(B1,B2) + \ -1, iff SNPij==(B2,B2) + + The SNP data and the meta information (such as iid, sex and phenotype) are + stored in the dataframe datadf, index is the Individual ID, + with following columns for metainfo: + + * family_id -> Family ID + * paternal_id -> Paternal ID + * maternal_id -> Maternal ID + * sex -> Sex (1=male; 2=female; other=unknown) + * phenotype -> Phenotype (-9, or 0 for unknown) + * population -> Population string (e.g. 
'ASW' - 'YRI')
+    * rest are SNP rs (ids)
+
+    More information is given in infodf:
+
+    * Chromosome:
+      - autosomal chromosomes            -> 1-22
+      - X    X chromosome                -> 23
+      - Y    Y chromosome                -> 24
+      - XY   Pseudo-autosomal region of X -> 25
+      - MT   Mitochondrial               -> 26
+    * Relative Position (to Chromosome) [base pairs]
+    """
+    try:
+        from pandas import read_pickle, DataFrame
+        from sys import stdout
+        import bz2
+    except ImportError:
+        raise ImportError("Need pandas for the hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset")
+    if not data_available(data_set):
+        download_data(data_set)
+    dirpath = os.path.join(data_path,'hapmap3')
+    hapmap_file_name = 'hapmap3_r2_b36_fwd.consensus.qc.poly'
+    preprocessed_data_paths = [os.path.join(dirpath,hapmap_file_name + file_name) for file_name in \
+                               ['.snps.pickle',
+                                '.info.pickle',
+                                '.nan.pickle']]
+    if not reduce(lambda a,b: a and b, map(os.path.exists, preprocessed_data_paths)):
+        if not overide_manual_authorize and not prompt_user("Preprocessing requires ~25GB "
+                                                            "of memory and can take a (very) long time, continue? [Y/n]"):
+            print "Preprocessing required for further usage."
+            return
+        status = "Preprocessing data, please be patient..."
+        print status
+        def write_status(message, progress, status):
+            stdout.write(" "*len(status)); stdout.write("\r"); stdout.flush()
+            status = r"[{perc: <{ll}}] {message: <13s}".format(message=message, ll=20,
+                                                               perc="="*int(20.*progress/100.))
+            stdout.write(status); stdout.flush()
+            return status
+        unpacked_files = [os.path.join(dirpath, hapmap_file_name+ending) for ending in ['.ped', '.map']]
+        if not reduce(lambda a,b: a and b, map(os.path.exists, unpacked_files)):
+            status=write_status('unpacking...', 0, '')
+            curr = 0
+            for newfilepath in unpacked_files:
+                if not os.path.exists(newfilepath):
+                    filepath = newfilepath + '.bz2'
+                    file_size = os.path.getsize(filepath)
+                    with open(newfilepath, 'wb') as new_file, open(filepath, 'rb') as f:
+                        decomp = bz2.BZ2Decompressor()
+                        file_processed = 0
+                        buffsize = 100 * 1024
+                        for data in iter(lambda : f.read(buffsize), b''):
+                            new_file.write(decomp.decompress(data))
+                            file_processed += len(data)
+                            status=write_status('unpacking...', curr+12.*file_processed/(file_size), status)
+                curr += 12
+                status=write_status('unpacking...', curr, status)
+        status=write_status('reading .ped...', 25, status)
+        # Preprocess data:
+        snpstrnp = np.loadtxt(unpacked_files[0], dtype=str)
+        status=write_status('reading .map...', 33, status)
+        mapnp = np.loadtxt(unpacked_files[1], dtype=str)
+        status=write_status('reading relationships.txt...', 42, status)
+        # and metainfo:
+        infodf = DataFrame.from_csv(os.path.join(dirpath,'./relationships_w_pops_121708.txt'), header=0, sep='\t')
+        infodf.set_index('IID', inplace=1)
+        status=write_status('filtering nan...', 45, status)
+        snpstr = snpstrnp[:,6:].astype('S1').reshape(snpstrnp.shape[0], -1, 2)
+        inan = snpstr[:,:,0] == '0'
+        status=write_status('filtering reference alleles...', 55, status)
+        ref = np.array(map(lambda x: np.unique(x)[-2:], snpstr.swapaxes(0,1)[:,:,:]))
+        status=write_status('encoding snps...', 70, status)
+        # Encode the information for each gene in {-1,0,1}:
+        status=write_status('encoding snps...', 73, status)
+        snps = (snpstr==ref[None,:,:])
+        status=write_status('encoding snps...', 76, status)
+        snps = (snps*np.array([1,-1])[None,None,:])
+        status=write_status('encoding snps...', 78, status)
+        snps = snps.sum(-1)
+        status=write_status('encoding snps...', 81, status)
+        snps = snps.astype('i8')
+
+        status=write_status('marking nan values...', 88, status)
+        # put in nan values (masked as -128):
+        snps[inan] = -128
+        status=write_status('setting up meta...', 94, status)
+        # get meta information:
+        metaheader = np.r_[['family_id', 'iid', 'paternal_id', 'maternal_id', 'sex', 'phenotype']]
+        metadf = DataFrame(columns=metaheader, data=snpstrnp[:,:6])
+        metadf.set_index('iid', inplace=1)
+        metadf = metadf.join(infodf.population)
+        metadf.to_pickle(preprocessed_data_paths[1])
+        # put everything together:
+        status=write_status('setting up snps...', 96, status)
+        snpsdf = DataFrame(index=metadf.index, data=snps, columns=mapnp[:,1])
+        snpsdf.to_pickle(preprocessed_data_paths[0])
+        status=write_status('setting up snps...', 98, status)
+        inandf = DataFrame(index=metadf.index, data=inan, columns=mapnp[:,1])
+        inandf.to_pickle(preprocessed_data_paths[2])
+        status=write_status('done :)', 100, status)
+        print ''
+    else:
+        print "loading snps..."
+        snpsdf = read_pickle(preprocessed_data_paths[0])
+        print "loading metainfo..."
+        metadf = read_pickle(preprocessed_data_paths[1])
+        print "loading nan entries..."
+        inandf = read_pickle(preprocessed_data_paths[2])
+    snps = snpsdf.values
+    populations = metadf.population.values.astype('S3')
+    hapmap = dict(name=data_set,
+                  description='The HapMap phase three SNP dataset - '
+                  '1184 samples out of 11 populations. inan is a '
+                  'boolean array, containing whether or not the '
+                  'given entry is nan (nans are masked as '
+                  '-128 in snps).',
+                  snpsdf=snpsdf,
+                  metadf=metadf,
+                  snps=snps,
+                  inan=inandf.values,
+                  inandf=inandf,
+                  populations=populations)
+    return hapmap
+
 def swiss_roll_1000():
     return swiss_roll(num_samples=1000)
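
The {-1, 0, 1} SNP encoding in hapmap3 above is done with a fairly dense broadcasting trick. A minimal standalone sketch of the same computation (toy base calls only, numpy alone; not part of the patch):

import numpy as np

# Toy stand-in for the .ped base calls: shape (samples, SNPs, 2 alleles); '0' marks a missing call.
snpstr = np.array([[['A', 'A'], ['C', 'T']],
                   [['A', 'G'], ['0', '0']]])
inan = snpstr[:, :, 0] == '0'
# Per-SNP reference pair (B1, B2): the two largest distinct symbols, so a stray '0' is dropped.
ref = np.array([np.unique(x)[-2:] for x in snpstr.swapaxes(0, 1)])
# Match each allele against (B1, B2) and weight the matches with (1, -1):
# (B1,B1) -> 1, (B1,B2) or (B2,B1) -> 0, (B2,B2) -> -1.
snps = ((snpstr == ref[None, :, :]) * np.array([1, -1])[None, None, :]).sum(-1)
snps[inan] = -128  # mask missing calls, as in the patch
print snps         # row one: 1 and 0; row two: 0 and -128
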
diff --git a/GPy/util/datasets/data_resources_create.py b/GPy/util/datasets/data_resources_create.py
index 8ae62a85..4e7c3524 100644
--- a/GPy/util/datasets/data_resources_create.py
+++ b/GPy/util/datasets/data_resources_create.py
@@ -24,12 +24,12 @@ data_resources = {'ankur_pose_data' : {'urls' : [neil_url + 'ankur_pose_data/'],
                                         'license': None,
                                         'size' : 1100584},
                      'cmu_mocap_full' : {'urls' : ['http://mocap.cs.cmu.edu'],
-                                        'files' : [['allasfamc.zip']],
-                                        'citation' : """Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.
-The database was created with funding from NSF EIA-0196217.""",
-                                        'details' : """CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.""",
-                                        'license' : """From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.""",
-                                        'size' : None},
+                                        'files' : [['allasfamc.zip']],
+                                        'citation' : """Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.
+The database was created with funding from NSF EIA-0196217.""",
+                                        'details' : """CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.""",
+                                        'license' : """From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.""",
+                                        'size' : None},
                      'creep_rupture' : {'urls' : ['http://www.msm.cam.ac.uk/map/data/tar/'],
                                         'files' : [['creeprupt.tar']],
                                         'citation' : 'Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.',
@@ -120,8 +120,49 @@ The database was created with funding from NSF EIA-0196217.""",
                                         'details' : """Accelerometer pen data used for robust regression by Tipping and Lawrence.""",
                                         'citation' : 'Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005',
                                         'license' : None,
-                                        'size' : 3410}
+                                        'size' : 3410},
+                     'hapmap3' : {'urls' : ['http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/latest_phaseIII_ncbi_b36/plink_format/'],
+                                  'files' : [['hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2', 'hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2', 'relationships_w_pops_121708.txt']],
+                                  'details' : """
+                                  HapMap Project: Single Nucleotide Polymorphism sequenced in all human populations.
+                                  The HapMap phase three SNP dataset - 1184 samples out of 11 populations.
+                                  See http://www.nature.com/nature/journal/v426/n6968/abs/nature02168.html for details.
+
+                                  SNP_matrix (A) encoding [see Paschou et al. 2007 (PCA-Correlated SNPs...)]:
+                                  Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then
+
+                                        /  1, iff SNPij==(B1,B1)
+                                  Aij = |  0, iff SNPij==(B1,B2)
+                                        \ -1, iff SNPij==(B2,B2)
+
+                                  The SNP data and the meta information (such as iid, sex and phenotype) are
+                                  stored in the dataframe datadf, index is the Individual ID,
+                                  with following columns for metainfo:
+
+                                  * family_id   -> Family ID
+                                  * paternal_id -> Paternal ID
+                                  * maternal_id -> Maternal ID
+                                  * sex         -> Sex (1=male; 2=female; other=unknown)
+                                  * phenotype   -> Phenotype (-9, or 0 for unknown)
+                                  * population  -> Population string (e.g. 'ASW' - 'YRI')
+                                  * rest are SNP rs (ids)
+
+                                  More information is given in infodf:
+
+                                  * Chromosome:
+                                    - autosomal chromosomes            -> 1-22
+                                    - X    X chromosome                -> 23
+                                    - Y    Y chromosome                -> 24
+                                    - XY   Pseudo-autosomal region of X -> 25
+                                    - MT   Mitochondrial               -> 26
+                                  * Relative Position (to Chromosome) [base pairs]
+
+                                  """,
+                                  'citation': """Gibbs, Richard A., et al. "The international HapMap project."
Nature 426.6968 (2003): 789-796.""", + 'license' : """International HapMap Project Public Access License (http://hapmap.ncbi.nlm.nih.gov/cgi-perl/registration#licence)""", + 'size' : 2*1729092237 + 62265}, } -with open('data_resources.json', 'w') as file: - json.dump(data_resources, file) +with open('data_resources.json', 'w') as f: + print "writing data_resources" + json.dump(data_resources, f) From 4fc8d6117f0c7c4e0f95fbba19929af5a6b95f78 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Thu, 17 Apr 2014 09:19:24 +0100 Subject: [PATCH 5/7] removed random.seed from the base of kernel_tests.py (the tests still pass) --- GPy/testing/kernel_tests.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py index 65998ad2..e5089626 100644 --- a/GPy/testing/kernel_tests.py +++ b/GPy/testing/kernel_tests.py @@ -9,8 +9,6 @@ from GPy.core.parameterization.param import Param verbose = 0 -np.random.seed(50) - class Kern_check_model(GPy.core.Model): """ From 583f3bef0a1f734ade1fd327072ed1bba4a6e9fd Mon Sep 17 00:00:00 2001 From: Neil Lawrence Date: Thu, 17 Apr 2014 07:05:16 -0400 Subject: [PATCH 6/7] First draft of base symbolic object, compiling with symbolic mapping. --- GPy/core/symbolic.py | 349 ++++++++++++++++++++++++++++++++++++ GPy/kern/_src/symbolic.py | 2 +- GPy/likelihoods/symbolic.py | 1 - GPy/mappings/symbolic.py | 56 ++++++ GPy/util/datasets.py | 4 +- GPy/util/symbolic.py | 97 +--------- 6 files changed, 410 insertions(+), 99 deletions(-) create mode 100644 GPy/core/symbolic.py create mode 100644 GPy/mappings/symbolic.py diff --git a/GPy/core/symbolic.py b/GPy/core/symbolic.py new file mode 100644 index 00000000..9074d481 --- /dev/null +++ b/GPy/core/symbolic.py @@ -0,0 +1,349 @@ +# Copyright (c) 2014, GPy authors (see AUTHORS.txt). +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import sys +from ..core.parameterization import Parameterized +import numpy as np +import sympy as sym +from ..core.parameterization import Param +from sympy.utilities.lambdify import lambdastr, _imp_namespace, _get_namespace +from sympy.utilities.iterables import numbered_symbols +from sympy import exp +from scipy.special import gammaln, gamma, erf, erfc, erfcx, polygamma +from GPy.util.functions import normcdf, normcdfln, logistic, logisticln + +class Symbolic_core(): + """ + Base model symbolic class. + """ + + def __init__(self, expression, cacheable, derivatives=None, param=None, func_modules=[]): + # Base class init, do some basic derivatives etc. + + # Func_modules sets up the right mapping for functions. + self.func_modules = func_modules + self.func_modules += [{'gamma':gamma, + 'gammaln':gammaln, + 'erf':erf, 'erfc':erfc, + 'erfcx':erfcx, + 'polygamma':polygamma, + 'normcdf':normcdf, + 'normcdfln':normcdfln, + 'logistic':logistic, + 'logisticln':logisticln}, + 'numpy'] + + self.expressions = {} + self.expressions['function'] = expression + self.cacheable = cacheable + + # pull the parameters and inputs out of the symbolic pdf + self.variables = {} + vars = [e for e in expression.atoms() if e.is_Symbol] + + # inputs are assumed to be those things that are + # cacheable. I.e. those things that aren't stored within the + # object except as cached. For covariance functions this is X + # and Z, for likelihoods F and for mapping functions X. 
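+        # For example (illustrative names): with the expression
+        # sym.exp(-(x_0 - z_0)**2/(2*lengthscale**2)) and cacheable=['X', 'Z'],
+        # x_0 and z_0 are collected below as cacheable inputs while lengthscale
+        # is left over and becomes a Param further down.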
+ self.cacheable_vars = [] # list of everything that's cacheable + for var in cacheable: + self.variables[var] = [e for e in vars if e.name.split('_')[0]==var.lower()] + self.cacheable_vars += self.variables[var] + for var in cacheable: + if not self.variables[var]: + raise ValueError('Variable ' + var + ' was specified as cacheable but is not in expression. Expected to find symbols of the form ' + var.lower() + '_0 to represent ' + var) + + # things that aren't cacheable are assumed to be parameters. + self.variables['theta'] = sorted([e for e in vars if not e in self.cacheable_vars],key=lambda e:e.name) + + # these are arguments for computing derivatives. + derivative_arguments = [] + if derivatives is not None: + for derivative in derivatives: + derivative_arguments += self.variables[derivative] + + # Do symbolic work to compute derivatives. + self.expressions['derivative'] = {theta.name : sym.diff(self.expressions['function'],theta) for theta in derivative_arguments} + # Add parameters to the model. + for theta in self.variables['theta']: + val = 1.0 + # TODO: need to decide how to handle user passing values for the se parameter vectors. + if param is not None: + if param.has_key(theta.name): + val = param[theta.name] + # Add parameter. + setattr(self, theta.name, Param(theta.name, val, None)) + self.add_parameters(getattr(self, theta.name)) + + self.namespace = [globals(), self.__dict__] + self._gen_code() + + def eval_parameters_changed(self): + # TODO: place checks for inf/nan in here + # do all the precomputation codes. + for variable, code in sorted(self.code['params_change'].iteritems()): + setattr(self, variable, eval(code, *self.namespace)) + self.eval_update_cache() + + def eval_update_cache(self, X=None): + # TODO: place checks for inf/nan in here + if X is not None: + for i, theta in enumerate(self.variables['X']): + setattr(self, theta.name, X[:, i][:, None]) + + for variable, code in sorted(self.code['update_cache'].iteritems()): + setattr(self, variable, eval(code, *self.namespace)) + + def eval_update_gradients(self, partial, X): + # TODO: place checks for inf/nan in here + for theta in self.variables['theta']: + code = self.code['derivative'][theta.name] + setattr(getattr(self, theta.name), + 'gradient', + (partial*eval(code, *self.namespace)).sum()) + + def eval_gradients_X(self, partial, X): + gradients_X = np.zeros_like(X) + self.eval_update_cache(X) + for i, theta in enumerate(self.variables['X']): + code = self.code['derivative'][theta.name] + gradients_X[:, i:i+1] = partial*eval(code, *self.namespace) + return gradients_X + + def eval_f(self, X): + self.eval_update_cache(X) + return eval(self.code['function'], *self.namespace) + + def code_parameters_changed(self): + # do all the precomputation codes. 
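+        # Hedged example (illustrative names): for a parameter 'lengthscale'
+        # with sub-expression sub0 = lengthscale**2, this emits the line
+        #     self.sub0 = self.lengthscale**2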
+        lcode = ''
+        for variable, code in sorted(self.code['params_change'].iteritems()):
+            lcode += self._print_code(variable) + ' = ' + self._print_code(code) + '\n'
+        return lcode
+
+    def code_update_cache(self):
+        lcode = 'if X is not None:\n'
+        for i, theta in enumerate(self.variables['X']):
+            lcode+= "\t" + self._print_code(theta.name) + ' = X[:, ' + str(i) + "][:, None]\n"
+
+        for variable, code in sorted(self.code['update_cache'].iteritems()):
+            lcode+= self._print_code(variable) + ' = ' + self._print_code(code) + "\n"
+
+        return lcode
+
+    def code_update_gradients(self):
+        lcode = ''
+        for theta in self.variables['theta']:
+            code = self.code['derivative'][theta.name]
+            lcode += self._print_code(theta.name) + '.gradient = (partial*(' + self._print_code(code) + ')).sum()\n'
+        return lcode
+
+    def code_gradients_X(self):
+        lcode = 'gradients_X = np.zeros_like(X)\n'
+        lcode += 'self.update_cache(X)\n'
+        for i, theta in enumerate(self.variables['X']):
+            code = self.code['derivative'][theta.name]
+            lcode += 'gradients_X[:, ' + str(i) + ':' + str(i) + '+1] = partial*' + self._print_code(code) + '\n'
+        lcode += 'return gradients_X\n'
+        return lcode
+
+    def code_f(self):
+        lcode = 'self.update_cache(X)\n'
+        lcode += 'return ' + self._print_code(self.code['function'])
+        return lcode
+
+    def stabilise(self):
+        """Stabilise the code in the model."""
+        # this code is applied to all expressions in the model in an attempt to stabilise them.
+        pass
+
+    def _gen_namespace(self, modules=None, use_imps=True):
+        """Gets the relevant namespaces for the given expressions."""
+        from sympy.core.symbol import Symbol
+
+        # If the user hasn't specified any modules, use what is available.
+        module_provided = True
+        if modules is None:
+            module_provided = False
+            # Use either numpy (if available) or python.math where possible.
+            # XXX: This leads to different behaviour on different systems and
+            #      might be the reason for irreproducible errors.
+            modules = ["math", "mpmath", "sympy"]
+
+            try:
+                __import__("numpy")
+            except ImportError:
+                pass
+            else:
+                modules.insert(1, "numpy")
+
+        # Get the needed namespaces.
+        namespaces = []
+        # First find any function implementations
+        if use_imps:
+            for expr in self.expression_list:
+                namespaces.append(_imp_namespace(expr))
+        # Check for dict before iterating
+        if isinstance(modules, (dict, str)) or not hasattr(modules, '__iter__'):
+            namespaces.append(modules)
+        else:
+            namespaces += list(modules)
+        # fill namespace with first having highest priority
+        namespace = {}
+        for m in namespaces[::-1]:
+            buf = _get_namespace(m)
+            namespace.update(buf)
+        for expr in self.expression_list:
+            if hasattr(expr, "atoms"):
+                # Try if you can extract symbols from the expression.
+                # Move on if expr.atoms is not implemented.
+                syms = expr.atoms(Symbol)
+                for term in syms:
+                    namespace.update({str(term): term})
+
+        return namespace
+
+    def update_expression_list(self):
+        """Extract a list of expressions from the dictionary of expressions."""
+        self.expression_list = [] # code arrives in dictionary, but is passed in this list
+        self.expression_keys = [] # Keep track of the dictionary keys.
+        self.expression_order = [] # This may be unnecessary. It's to give ordering for cse.
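+        # expression_order is only a coarse ranking: the 'function' expression
+        # first, other expressions next, then the derivatives. It fixes the
+        # order in which the expressions are handed to sym.cse in _gen_code.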
+        for key in self.expressions.keys():
+            if key == 'function':
+                self.expression_list.append(self.expressions[key])
+                self.expression_keys.append([key])
+                self.expression_order.append(1)
+                self.code[key] = ''
+            elif key[-10:] == 'derivative':
+                self.code[key] = {}
+                for dkey in self.expressions[key].keys():
+                    self.expression_list.append(self.expressions[key][dkey])
+                    self.expression_keys.append([key, dkey])
+                    # note: key[:-10] keeps the trailing underscore of keys like 'second_derivative'.
+                    if key[:-10] == 'first_' or key[:-10] == '':
+                        self.expression_order.append(3) #sym.count_ops(self.expressions[key][dkey]))
+                    elif key[:-10] == 'second_':
+                        self.expression_order.append(4) #sym.count_ops(self.expressions[key][dkey]))
+                    elif key[:-10] == 'third_':
+                        self.expression_order.append(5) #sym.count_ops(self.expressions[key][dkey]))
+                    self.code[key][dkey] = ''
+            else:
+                self.expression_list.append(self.expressions[key])
+                self.expression_keys.append([key])
+                self.expression_order.append(2)
+                self.code[key] = ''
+
+        # This step may be unnecessary: it is not clear whether the sub
+        # expression elimination is order sensitive. It orders the list with
+        # the 'function' code first and derivatives after. Sorting on the
+        # order value alone avoids comparing sympy expressions on ties.
+        self.expression_order, self.expression_list, self.expression_keys = zip(*sorted(zip(self.expression_order, self.expression_list, self.expression_keys), key=lambda x: x[0]))
+
+
+    def _gen_code(self, cache_prefix = 'cache', sub_prefix = 'sub', prefix='XoXoXoX'):
+        """Generate code for the list of expressions provided using the common sub-expression eliminator to separate out portions that are computed multiple times."""
+        # This is the dictionary that stores all the generated code.
+        self.code = {}
+
+        # Convert the expressions to a list for common sub expression elimination
+        # We should find the following type of expressions: 'function', 'derivative', 'second_derivative', 'third_derivative'.
+        self.update_expression_list()
+
+        # Apply any global stabilisation operations to expressions.
+        self.stabilise()
+
+        # Helper functions to get data in and out of dictionaries.
+        # this code from http://stackoverflow.com/questions/14692690/access-python-nested-dictionary-items-via-a-list-of-keys
+        def getFromDict(dataDict, mapList):
+            return reduce(lambda d, k: d[k], mapList, dataDict)
+        def setInDict(dataDict, mapList, value):
+            getFromDict(dataDict, mapList[:-1])[mapList[-1]] = value
+
+
+        # Do the common sub expression elimination
+        subexpressions, expression_substituted_list = sym.cse(self.expression_list, numbered_symbols(prefix=prefix))
+        cacheable_list = []
+
+        # Sort out any expression that's dependent on something that scales with data size (these are listed in cacheable).
+        self.expressions['params_change'] = []
+        self.expressions['update_cache'] = []
+        cache_expressions_list = []
+        sub_expression_list = []
+        for expr in subexpressions:
+            arg_list = [e for e in expr[1].atoms() if e.is_Symbol]
+            cacheable_symbols = [e for e in arg_list if e in cacheable_list or e in self.cacheable_vars]
+            if cacheable_symbols:
+                self.expressions['update_cache'].append((expr[0].name, self._expr2code(arg_list, expr[1])))
+                # list which ensures dependencies are cacheable.
+                cacheable_list.append(expr[0])
+                cache_expressions_list.append(expr[0].name)
+            else:
+                self.expressions['params_change'].append((expr[0].name, self._expr2code(arg_list, expr[1])))
+                sub_expression_list.append(expr[0].name)
+
+        # Replace original code with code including subexpressions.
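+        # At this point expression_substituted_list is written in terms of the
+        # cse symbols (XoXoXoX0, XoXoXoX1, ...); the loop below stores the
+        # resulting code strings back into self.code, and the cse names are
+        # renamed to cacheN/subN variables further down.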
+ for expr, keys in zip(expression_substituted_list, self.expression_keys): + arg_list = [e for e in expr.atoms() if e.is_Symbol] + setInDict(self.code, keys, self._expr2code(arg_list, expr)) + setInDict(self.expressions, keys, expr) + + # Create variable names for cache and sub expression portions + cache_dict = {} + self.variables[cache_prefix] = [] + for i, sub in enumerate(cache_expressions_list): + name = cache_prefix + str(i) + cache_dict[sub] = name + self.variables[cache_prefix].append(sym.var(name)) + + sub_dict = {} + self.variables[sub_prefix] = [] + for i, sub in enumerate(sub_expression_list): + name = sub_prefix + str(i) + sub_dict[sub] = name + self.variables[sub_prefix].append(sym.var(name)) + + # Replace sub expressions in main code with either cacheN or subN. + for key, val in cache_dict.iteritems(): + for keys in self.expression_keys: + setInDict(self.code, keys, + getFromDict(self.code,keys).replace(key, val)) + + for key, val in sub_dict.iteritems(): + for keys in self.expression_keys: + setInDict(self.code, keys, + getFromDict(self.code,keys).replace(key, val)) + + # Set up precompute code as either cacheN or subN. + self.code['update_cache'] = {} + for key, val in self.expressions['update_cache']: + expr = val + for key2, val2 in cache_dict.iteritems(): + expr = expr.replace(key2, val2) + for key2, val2 in sub_dict.iteritems(): + expr = expr.replace(key2, val2) + self.code['update_cache'][cache_dict[key]] = expr + + self.expressions['update_cache'] = dict(self.expressions['update_cache']) + self.code['params_change'] = {} + for key, val in self.expressions['params_change']: + expr = val + for key2, val2 in cache_dict.iteritems(): + expr = expr.replace(key2, val2) + for key2, val2 in sub_dict.iteritems(): + expr = expr.replace(key2, val2) + self.code['params_change'][sub_dict[key]] = expr + self.expressions['params_change'] = dict(self.expressions['params_change']) + + def _expr2code(self, arg_list, expr): + """Convert the given symbolic expression into code.""" + code = lambdastr(arg_list, expr) + function_code = code.split(':')[1].strip() + #for arg in arg_list: + # function_code = function_code.replace(arg.name, 'self.'+arg.name) + + return function_code + + def _print_code(self, code): + """Prepare code for string writing.""" + for key in self.variables.keys(): + for arg in self.variables[key]: + code = code.replace(arg.name, 'self.'+arg.name) + return code diff --git a/GPy/kern/_src/symbolic.py b/GPy/kern/_src/symbolic.py index 91193c5d..8f6d58c0 100644 --- a/GPy/kern/_src/symbolic.py +++ b/GPy/kern/_src/symbolic.py @@ -169,7 +169,7 @@ class Symbolic(Kern): def gradients_X(self, dL_dK, X, X2=None): #if self._X is None or X.base is not self._X.base or X2 is not None: self._K_computations(X, X2) - gradients_X = np.zeros((X.shape[0], X.shape[1])) + gradients_X = np.zeros_like(X) for i, x in enumerate(self._sym_x): gf = self._K_derivatives_code[x.name] gradients_X[:, i] = (gf(**self._arguments)*dL_dK).sum(1) diff --git a/GPy/likelihoods/symbolic.py b/GPy/likelihoods/symbolic.py index 370f4599..feac9e9d 100644 --- a/GPy/likelihoods/symbolic.py +++ b/GPy/likelihoods/symbolic.py @@ -5,7 +5,6 @@ try: import sympy as sym sympy_available=True from sympy.utilities.lambdify import lambdify - from GPy.util.symbolic import stabilise except ImportError: sympy_available=False diff --git a/GPy/mappings/symbolic.py b/GPy/mappings/symbolic.py new file mode 100644 index 00000000..c55ca983 --- /dev/null +++ b/GPy/mappings/symbolic.py @@ -0,0 +1,56 @@ +# Copyright (c) 2014 GPy Authors 
+# Licensed under the BSD 3-clause license (see LICENSE.txt)
+
+import sympy as sym
+from ..core.mapping import Mapping, Bijective_mapping
+from ..core.symbolic import Symbolic_core
+import numpy as np
+
+class Symbolic(Mapping, Symbolic_core):
+    """
+    Symbolic mapping
+
+    Mapping where the form of the mapping is provided by a sympy expression.
+
+    """
+    def __init__(self, input_dim, output_dim, f=None, name='symbolic', param=None, func_modules=None):
+        if f is None:
+            raise ValueError, "You must provide an argument for the function."
+
+        # Avoid the mutable default argument trap: each instance gets its own list.
+        if func_modules is None:
+            func_modules = []
+
+        Mapping.__init__(self, input_dim, output_dim, name=name)
+        Symbolic_core.__init__(self, f, ['X'], derivatives=['X', 'theta'], param=param, func_modules=func_modules)
+
+        self._initialize_cache()
+        self.parameters_changed()
+
+    def _initialize_cache(self):
+        self.x_0 = np.random.normal(size=(3, self.input_dim))
+
+    def parameters_changed(self):
+        self.eval_parameters_changed()
+
+    def update_cache(self, X):
+        self.eval_update_cache(X)
+
+    def update_gradients(self, partial, X):
+        self.eval_update_gradients(partial, X)
+
+    def gradients_X(self, partial, X):
+        return self.eval_gradients_X(partial, X)
+
+    def f(self, X):
+        """Evaluate the mapping at the points in X."""
+        return self.eval_f(X)
+
+    def df_dX(self, X):
+        """Derivative of the mapping with respect to X."""
+        raise NotImplementedError
+
+    def df_dtheta(self, X):
+        """Derivative of the mapping with respect to the parameters."""
+        raise NotImplementedError
diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py
index f53163f4..04f09d3e 100644
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@@ -304,8 +304,8 @@ def football_data(season='1314', data_set='football_data'):
     data_set_season = data_set + '_' + season
     data_resources[data_set_season] = copy.deepcopy(data_resources[data_set])
     data_resources[data_set_season]['urls'][0]+=season + '/'
-    start_year = int(year[0:2])
-    end_year = int(year[2:4])
+    start_year = int(season[0:2])
+    end_year = int(season[2:4])
     files = ['E0.csv', 'E1.csv', 'E2.csv', 'E3.csv']
     if start_year>4 and start_year < 93:
         files += ['EC.csv']
diff --git a/GPy/util/symbolic.py b/GPy/util/symbolic.py
index 5644fb76..59837cb0 100644
--- a/GPy/util/symbolic.py
+++ b/GPy/util/symbolic.py
@@ -1,100 +1,7 @@
+import sys
+import numpy as np
 import sympy as sym
 from sympy import Function, S, oo, I, cos, sin, asin, log, erf, pi, exp, sqrt, sign, gamma, polygamma
-from sympy.utilities.lambdify import lambdastr
-from sympy.utilities.iterables import numbered_symbols
 
-def stabilise(e):
-    """Attempt to find the most numerically stable form of an expression"""
-    return e #sym.expand(e)
-
-
-def gen_code(expressions, cache_prefix = 'cache', sub_prefix = 'sub', prefix='XoXoXoX', cacheable=[]):
-    """Generate code for the list of expressions provided using the common sub-expression eliminator to separate out portions that are computed multiple times."""
-    # First convert the expressions to a list.
-    # We should find the following type of expressions: 'function', 'derivative', 'second_derivative', 'third_derivative'.
-
-    # Helper functions to get data in and out of dictionaries.
-    # this code from http://stackoverflow.com/questions/14692690/access-python-nested-dictionary-items-via-a-list-of-keys
-    def getFromDict(dataDict, mapList):
-        return reduce(lambda d, k: d[k], mapList, dataDict)
-    def setInDict(dataDict, mapList, value):
-        getFromDict(dataDict, mapList[:-1])[mapList[-1]] = value
-
-    # This is the return dictionary that stores all the generated code.
- code = {} - expression_list = [] - key_list = [] - order_list = [] - code['main'] = {} - for key in expressions.keys(): - if key == 'function': - expression_list.append(expressions[key]) - key_list.append([key]) - order_list.append(1) - code['main'][key] = '' - elif key[-10:] == 'derivative': - code['main'][key] = {} - for dkey in expressions[key].keys(): - expression_list.append(expressions[key][dkey]) - key_list.append([key, dkey]) - if key[:-10] == 'first' or key[:-10] == '': - order_list.append(3) #sym.count_ops(expressions[key][dkey])) - elif key[:-10] == 'second': - order_list.append(4) #sym.count_ops(expressions[key][dkey])) - elif key[:-10] == 'third': - order_list.append(5) #sym.count_ops(expressions[key][dkey])) - code['main'][key][dkey] = '' - else: - expression_list.append(expressions[key]) - key_list.append([key]) - order_list.append(2) - code['main'][key] = '' - - # This step may be unecessary. - # Not 100% sure if the sub expression elimination is order sensitive. This step orders the list with the 'function' code first and derivatives after. - order_list, expression_list, key_list = zip(*sorted(zip(order_list, expression_list, key_list))) - - print expression_list - - subexpressions, expression_substituted_list = sym.cse(expression_list, numbered_symbols(prefix=prefix)) - cacheable_list = [] - # Create strings that lambda strings from the expressions. - code['params_change'] = [] - code['cache'] = [] - for expr in subexpressions: - arg_list = [e for e in expr[1].atoms() if e.is_Symbol] - cacheable_symbols = [e for e in arg_list if e in cacheable_list or e in cacheable] - if cacheable_symbols: - code['cacheable'].append((expr[0],expr2code(arg_list, expr[1]))) - # list which ensures dependencies are cacheable. - cacheable_list.append(expr[0]) - code['cacheexpressions'].append(expr[0]) - else: - code['params_change'].append((expr[0],expr2code(arg_list, expr[1]))) - code['subexpressions'].append(expr[0]) - - for expr, keys in zip(expression_substituted_list, key_list): - arg_list = [e for e in expr.atoms() if e.is_Symbol] - setInDict(code['main'], keys, expr2code(arg_list, expr)) - setInDict(expressions, keys, expr) - - sub_dict = {} - for i, sub in enumerate(code['cacheexpressions']): - sub_dict[sub.name] = cache_prefix + str(i) - for i, sub in enumerate(code['subexpressions']): - sub_dict[sub.name] = sub_prefix + str(i) - - - - return code - -def expr2code(arg_list, expr): - """Convert the given symbolic expression into code.""" - code = lambdastr(arg_list, expr) - function_code = code.split(':')[1] - for arg in arg_list: - function_code = function_code.replace(arg.name, 'self.'+arg.name) - - return function_code class logistic(Function): """The logistic function as a symbolic function.""" From 8abc45c4caff22c7c3742af2108496c529c61b99 Mon Sep 17 00:00:00 2001 From: mzwiessele Date: Thu, 17 Apr 2014 15:01:43 +0100 Subject: [PATCH 7/7] bugfix: mixed up global and local index in unfixing --- GPy/core/parameterization/index_operations.py | 2 +- GPy/core/parameterization/parameter_core.py | 35 ++++++++++++------- GPy/kern/__init__.py | 4 +-- GPy/testing/index_operations_tests.py | 15 ++++++-- GPy/testing/parameterized_tests.py | 12 +++++++ 5 files changed, 49 insertions(+), 19 deletions(-) diff --git a/GPy/core/parameterization/index_operations.py b/GPy/core/parameterization/index_operations.py index 12b3a298..1f3ac934 100644 --- a/GPy/core/parameterization/index_operations.py +++ b/GPy/core/parameterization/index_operations.py @@ -184,7 +184,7 @@ class 
ParameterIndexOperationsView(object):
     def remove(self, prop, indices):
         removed = self._param_index_ops.remove(prop, numpy.array(indices)+self._offset)
         if removed.size > 0:
-            return removed - self._size + 1
+            return removed - self._offset
         return removed
 
diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py
index b513ba44..68140763 100644
--- a/GPy/core/parameterization/parameter_core.py
+++ b/GPy/core/parameterization/parameter_core.py
@@ -312,7 +312,8 @@ class Indexable(object):
         This does not need to account for shaped parameters, as it
         basically just sums up the parameter sizes which come before param.
         """
-        raise NotImplementedError, "shouldnt happen, offset required from non parameterization object?"
+        return 0
+        #raise NotImplementedError, "shouldn't happen, offset required from non parameterization object?"
 
     def _raveled_index_for(self, param):
         """
@@ -320,7 +321,8 @@ class Indexable(object):
         that is an int array, containing the indexes for the flattened
         param inside this parameterized logic.
         """
-        raise NotImplementedError, "shouldnt happen, raveld index transformation required from non parameterization object?"
+        return param._raveled_index()
+        #raise NotImplementedError, "shouldn't happen, raveled index transformation required from non parameterization object?"
 
 
 class Constrainable(Nameable, Indexable, Observable):
@@ -368,10 +370,10 @@ class Constrainable(Nameable, Indexable, Observable):
         if value is not None:
             self[:] = value
         reconstrained = self.unconstrain()
-        self._add_to_index_operations(self.constraints, reconstrained, __fixed__, warning)
-        rav_i = self._highest_parent_._raveled_index_for(self)
-        self._highest_parent_._set_fixed(rav_i)
+        index = self._add_to_index_operations(self.constraints, reconstrained, __fixed__, warning)
+        self._highest_parent_._set_fixed(self, index)
         self.notify_observers(self, None if trigger_parent else -np.inf)
+        return index
     fix = constrain_fixed
 
     def unconstrain_fixed(self):
@@ -379,7 +381,8 @@ class Constrainable(Nameable, Indexable, Observable):
         This parameter will no longer be fixed.
         """
         unconstrained = self.unconstrain(__fixed__)
-        self._highest_parent_._set_unfixed(unconstrained)
+        self._highest_parent_._set_unfixed(self, unconstrained)
+        return unconstrained
     unfix = unconstrain_fixed
 
     def _ensure_fixes(self):
@@ -388,14 +391,16 @@ class Constrainable(Nameable, Indexable, Observable):
         # Param: ones(self._realsize_
         if not self._has_fixes(): self._fixes_ = np.ones(self.size, dtype=bool)
 
-    def _set_fixed(self, index):
+    def _set_fixed(self, param, index):
         self._ensure_fixes()
-        self._fixes_[index] = FIXED
+        offset = self._offset_for(param)
+        self._fixes_[index+offset] = FIXED
         if np.all(self._fixes_): self._fixes_ = None # ==UNFIXED
 
-    def _set_unfixed(self, index):
+    def _set_unfixed(self, param, index):
         self._ensure_fixes()
-        self._fixes_[index] = UNFIXED
+        offset = self._offset_for(param)
+        self._fixes_[index+offset] = UNFIXED
         if np.all(self._fixes_): self._fixes_ = None # ==UNFIXED
 
     def _connect_fixes(self):
@@ -469,8 +474,9 @@ class Constrainable(Nameable, Indexable, Observable):
         """
         self.param_array[...] 
= transform.initialize(self.param_array)
         reconstrained = self.unconstrain()
-        self._add_to_index_operations(self.constraints, reconstrained, transform, warning)
+        added = self._add_to_index_operations(self.constraints, reconstrained, transform, warning)
         self.notify_observers(self, None if trigger_parent else -np.inf)
+        return added
 
     def unconstrain(self, *transforms):
         """
@@ -549,7 +555,9 @@ class Constrainable(Nameable, Indexable, Observable):
         if warning and reconstrained.size > 0:
             # TODO: figure out which parameters have changed and only print those
             print "WARNING: reconstraining parameters {}".format(self.parameter_names() or self.name)
-        which.add(what, self._raveled_index())
+        index = self._raveled_index()
+        which.add(what, index)
+        return index
 
     def _remove_from_index_operations(self, which, transforms):
         """
@@ -561,9 +569,9 @@ class Constrainable(Nameable, Indexable, Observable):
         removed = np.empty((0,), dtype=int)
         for t in transforms:
             unconstrained = which.remove(t, self._raveled_index())
             removed = np.union1d(removed, unconstrained)
             if t is __fixed__:
-                self._highest_parent_._set_unfixed(unconstrained)
+                self._highest_parent_._set_unfixed(self, unconstrained)
         return removed
 
 
diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py
index ef99e9a6..14378f55 100644
--- a/GPy/kern/__init__.py
+++ b/GPy/kern/__init__.py
@@ -20,6 +20,6 @@ except ImportError:
 
 if sympy_available:
     from _src.symbolic import Symbolic
-    from _src.heat_eqinit import Heat_eqinit
-    from _src.ode1_eq_lfm import Ode1_eq_lfm
+    #from _src.heat_eqinit import Heat_eqinit
+    #from _src.ode1_eq_lfm import Ode1_eq_lfm
 
diff --git a/GPy/testing/index_operations_tests.py b/GPy/testing/index_operations_tests.py
index 37cec10b..49cc844a 100644
--- a/GPy/testing/index_operations_tests.py
+++ b/GPy/testing/index_operations_tests.py
@@ -24,12 +24,14 @@ class Test(unittest.TestCase):
         self.assertDictEqual(self.param_index._properties, {})
 
     def test_remove(self):
-        self.param_index.remove(three, np.r_[3:10])
+        removed = self.param_index.remove(three, np.r_[3:10])
+        self.assertListEqual(removed.tolist(), [4, 7])
         self.assertListEqual(self.param_index[three].tolist(), [2])
-        self.param_index.remove(one, [1])
+        removed = self.param_index.remove(one, [1])
+        self.assertListEqual(removed.tolist(), [])
         self.assertListEqual(self.param_index[one].tolist(), [3])
         self.assertListEqual(self.param_index.remove('not in there', []).tolist(), [])
-        self.param_index.remove(one, [9])
+        removed = self.param_index.remove(one, [9])
         self.assertListEqual(self.param_index[one].tolist(), [3])
         self.assertListEqual(self.param_index.remove('not in there', [2,3,4]).tolist(), [])
 
@@ -78,6 +80,13 @@ class Test(unittest.TestCase):
             self.assertEqual(i, i2)
             self.assertTrue(np.all(v == v2))
 
+    def test_indexview_remove(self):
+        removed = self.view.remove(two, [3])
+        self.assertListEqual(removed.tolist(), [3])
+        removed = self.view.remove(three, np.r_[:5])
+        self.assertListEqual(removed.tolist(), [0, 2])
+
+
     def test_misc(self):
         for k,v in self.param_index.copy()._properties.iteritems():
             self.assertListEqual(self.param_index[k].tolist(), v.tolist())
diff --git a/GPy/testing/parameterized_tests.py b/GPy/testing/parameterized_tests.py
index 57669e93..fbdedc61 100644
--- a/GPy/testing/parameterized_tests.py
+++ b/GPy/testing/parameterized_tests.py
@@ -153,6 +153,18 @@ class ParameterizedTest(unittest.TestCase):
         self.testmodel.randomize()
         np.testing.assert_equal(variances, self.testmodel['.*var'].values())
 
+    def test_fix_unfix(self):
+        fixed = 
self.testmodel.kern.lengthscale.fix() + self.assertListEqual(fixed.tolist(), [0]) + unfixed = self.testmodel.kern.lengthscale.unfix() + self.testmodel.kern.lengthscale.constrain_positive() + self.assertListEqual(unfixed.tolist(), [0]) + + fixed = self.testmodel.kern.fix() + self.assertListEqual(fixed.tolist(), [0,1]) + unfixed = self.testmodel.kern.unfix() + self.assertListEqual(unfixed.tolist(), [0,1]) + def test_printing(self): print self.test1 print self.param
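
For reference, here is a minimal standalone sketch (not taken from the patches above) of the pattern the unified code generator is built on: sympy's common sub-expression elimination pulls shared work out of a covariance function and its derivatives, and lambdastr turns each piece into Python source. The RBF-style expression and the variable names below are invented for the illustration; only the 'XoXoXoX' temporary prefix mirrors the patched code.

    import sympy as sym
    from sympy.utilities.lambdify import lambdastr
    from sympy.utilities.iterables import numbered_symbols

    x_0, z_0, lengthscale = sym.symbols('x_0 z_0 lengthscale')
    k = sym.exp(-(x_0 - z_0)**2/(2*lengthscale**2))
    exprs = [k, sym.diff(k, x_0), sym.diff(k, lengthscale)]

    # cse returns (temporary, expression) pairs plus the rewritten expressions.
    subexprs, reduced = sym.cse(exprs, numbered_symbols(prefix='XoXoXoX'))
    for name, sub in subexprs:
        args = [e for e in sub.atoms() if e.is_Symbol]
        print('%s = %s' % (name, lambdastr(args, sub).split(':', 1)[1].strip()))
    for expr in reduced:
        args = [e for e in expr.atoms() if e.is_Symbol]
        print(lambdastr(args, expr).split(':', 1)[1].strip())

Run on this expression, the shared exponential term is pulled out once as a temporary and reused by all three generated code fragments, which is exactly the saving the cacheN/subN renaming in the patch is after.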
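The one-line fix to ParameterIndexOperationsView.remove in PATCH 7/7 is easiest to check numerically. A view exposes the parent's indices [offset, offset + size) as [0, size), and the parent's remove() reports global indices, so the result must be shifted back by the offset, not by size - 1. A toy check with invented numbers:

    import numpy as np

    offset, size = 5, 10                      # view covers parent indices 5..14
    global_removed = np.array([6, 9])         # what the parent reports removed
    local_removed = global_removed - offset   # what the view must return: [1, 4]
    # The old expression, global_removed - size + 1, gives [-3, 0] here; it only
    # agrees with the correct answer in the special case offset == size - 1.
    print(local_removed)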
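With PATCH 7/7 applied, fix() and unfix() return the raveled indices they touched, which is what the new test_fix_unfix asserts. A hedged usage sketch, assuming a GPRegression model with the default two-parameter RBF kernel (variance and lengthscale); the data here is invented:

    import numpy as np
    import GPy

    X, Y = np.random.randn(10, 1), np.random.randn(10, 1)
    m = GPy.models.GPRegression(X, Y)

    fixed = m.kern.lengthscale.fix()      # indices fixed within the kernel, e.g. [0]
    unfixed = m.kern.lengthscale.unfix()  # the indices just released
    fixed = m.kern.fix()                  # fixing the whole kernel returns all its indices, e.g. [0, 1]
    unfixed = m.kern.unfix()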