From 2256127130102521f2b4b48cdc6b017d4f8458fc Mon Sep 17 00:00:00 2001 From: Neil Lawrence Date: Tue, 13 May 2014 05:23:36 +0100 Subject: [PATCH 01/11] Made openmp switch in only dependent on potion in rbf.py and linear.py --- GPy/kern/_src/linear.py | 51 +++++++++++++------ GPy/kern/_src/rbf.py | 42 ++++++++++----- .../matplot_dep/dim_reduction_plots.py | 2 +- GPy/plotting/matplot_dep/models_plots.py | 4 +- GPy/util/data_resources.json | 27 ++++++++-- GPy/util/datasets.py | 38 ++++++++++---- GPy/util/misc.py | 6 +-- 7 files changed, 122 insertions(+), 48 deletions(-) diff --git a/GPy/kern/_src/linear.py b/GPy/kern/_src/linear.py index f9dacf02..3f696431 100644 --- a/GPy/kern/_src/linear.py +++ b/GPy/kern/_src/linear.py @@ -12,6 +12,7 @@ from ...core.parameterization.transformations import Logexp from ...util.caching import Cache_this from ...core.parameterization import variational from psi_comp import linear_psi_comp +from ...util.config import * class Linear(Kern): """ @@ -224,12 +225,23 @@ class Linear(Kern): AZZA = ZA.T[:, None, :, None] * ZA[None, :, None, :] AZZA = AZZA + AZZA.swapaxes(1, 2) AZZA_2 = AZZA/2. + if config.getboolean('parallel', 'openmp'): + pragma_string = '#pragma omp parallel for private(m,mm,q,qq,factor,tmp)' + header_string = '#include ' + weave_options = {'headers' : [''], + 'extra_compile_args': ['-fopenmp -O3'], + 'extra_link_args' : ['-lgomp'], + 'libraries': ['gomp']} + else: + pragma_string = '' + header_string = '' + weave_options = {'extra_compile_args': ['-O3']} #Using weave, we can exploit the symmetry of this problem: code = """ int n, m, mm,q,qq; double factor,tmp; - #pragma omp parallel for private(m,mm,q,qq,factor,tmp) + %s for(n=0;n + %s #include - """ - weave_options = {'headers' : [''], - 'extra_compile_args': ['-fopenmp -O3'], #-march=native'], - 'extra_link_args' : ['-lgomp']} + """ % header_string mu = vp.mean N,num_inducing,input_dim,mu = mu.shape[0],Z.shape[0],mu.shape[1],param_to_array(mu) - weave.inline(code, support_code=support_code, libraries=['gomp'], + weave.inline(code, support_code=support_code, arg_names=['N','num_inducing','input_dim','mu','AZZA','AZZA_2','target_mu','target_S','dL_dpsi2'], type_converters=weave.converters.blitz,**weave_options) def _weave_dpsi2_dZ(self, dL_dpsi2, Z, vp, target): AZA = self.variances*self._ZAinner(vp, Z) + + if config.getboolean('parallel', 'openmp'): + pragma_string = '#pragma omp parallel for private(n,mm,q)' + header_string = '#include ' + weave_options = {'headers' : [''], + 'extra_compile_args': ['-fopenmp -O3'], + 'extra_link_args' : ['-lgomp'], + 'libraries': ['gomp']} + else: + pragma_string = '' + header_string = '' + weave_options = {'extra_compile_args': ['-O3']} + code=""" int n,m,mm,q; - #pragma omp parallel for private(n,mm,q) + %s for(m=0;m + %s #include - """ - weave_options = {'headers' : [''], - 'extra_compile_args': ['-fopenmp -O3'], #-march=native'], - 'extra_link_args' : ['-lgomp']} + """ % header_string N,num_inducing,input_dim = vp.mean.shape[0],Z.shape[0],vp.mean.shape[1] mu = param_to_array(vp.mean) - weave.inline(code, support_code=support_code, libraries=['gomp'], + weave.inline(code, support_code=support_code, arg_names=['N','num_inducing','input_dim','AZA','target','dL_dpsi2'], type_converters=weave.converters.blitz,**weave_options) diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index e0071fb9..5bc80871 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -10,6 +10,7 @@ from GPy.util.caching import Cache_this from ...core.parameterization import variational from psi_comp import ssrbf_psi_comp from psi_comp.ssrbf_psi_gpucomp import PSICOMP_SSRBF +from ...util.config import * class RBF(Stationary): """ @@ -231,6 +232,16 @@ class RBF(Stationary): @Cache_this(limit=1) def _psi2computations(self, Z, vp): + + if config.getboolean('parallel', 'openmp'): + pragma_string = '#pragma omp parallel for private(tmp, exponent_tmp)' + header_string = '#include ' + libraries = ['gomp'] + else: + pragma_string = '' + header_string = '' + libraries = [] + mu, S = vp.mean, vp.variance N, Q = mu.shape @@ -253,8 +264,7 @@ class RBF(Stationary): variance_sq = float(np.square(self.variance)) code = """ double tmp, exponent_tmp; - - #pragma omp parallel for private(tmp, exponent_tmp) + %s for (int n=0; n + %s #include - """ + """ % header_string mu = param_to_array(mu) - weave.inline(code, support_code=support_code, libraries=['gomp'], + weave.inline(code, support_code=support_code, libraries=libraries, arg_names=['N', 'M', 'Q', 'mu', 'Zhat', 'mudist_sq', 'mudist', 'denom_l2', 'Zdist_sq', 'half_log_denom', 'psi2', 'variance_sq'], type_converters=weave.converters.blitz, **self.weave_options) @@ -303,12 +313,20 @@ class RBF(Stationary): #return 2.*np.einsum( 'ijk,ijk,ijkl,il->l', dL_dpsi2, psi2, Zdist_sq * (2.*S[:,None,None,:]/l2 + 1.) + mudist_sq + S[:, None, None, :] / l2, 1./(2.*S + l2))*self.lengthscale result = np.zeros(self.input_dim) + if config.getboolean('parallel', 'openmp'): + pragma_string = '#pragma omp parallel for reduction(+:tmp)' + header_string = '#include ' + libraries = ['gomp'] + else: + pragma_string = '' + header_string = '' + libraries = [] code = """ double tmp; for(int q=0; q + %s #include - """ + """ % header_string N,Q = S.shape M = psi2.shape[-1] S = param_to_array(S) - weave.inline(code, support_code=support_code, libraries=['gomp'], + weave.inline(code, support_code=support_code, libraries=libraries, arg_names=['psi2', 'dL_dpsi2', 'N', 'M', 'Q', 'mudist_sq', 'l2', 'Zdist_sq', 'S', 'result'], type_converters=weave.converters.blitz, **self.weave_options) diff --git a/GPy/plotting/matplot_dep/dim_reduction_plots.py b/GPy/plotting/matplot_dep/dim_reduction_plots.py index ca2c890f..71e08c6b 100644 --- a/GPy/plotting/matplot_dep/dim_reduction_plots.py +++ b/GPy/plotting/matplot_dep/dim_reduction_plots.py @@ -97,7 +97,7 @@ def plot_latent(model, labels=None, which_indices=None, elif type(ul) is np.int64: this_label = 'class %i' % ul else: - this_label = 'class %i' % i + this_label = unicode(i) m = marker.next() index = np.nonzero(labels == ul)[0] diff --git a/GPy/plotting/matplot_dep/models_plots.py b/GPy/plotting/matplot_dep/models_plots.py index 57b64ae5..84747d05 100644 --- a/GPy/plotting/matplot_dep/models_plots.py +++ b/GPy/plotting/matplot_dep/models_plots.py @@ -14,7 +14,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', which_data_ycols='all', fixed_inputs=[], levels=20, samples=0, fignum=None, ax=None, resolution=None, plot_raw=False, - linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue'], Y_metadata=None): + linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue'], Y_metadata=None, data_symbol='kx'): """ Plot the posterior of the GP. - In one dimension, the function is plotted with a shaded region identifying two standard deviations. @@ -97,7 +97,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', for d in which_data_ycols: plots['gpplot'] = gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], ax=ax, edgecol=linecol, fillcol=fillcol) - plots['dataplot'] = ax.plot(X[which_data_rows,free_dims], Y[which_data_rows, d], 'kx', mew=1.5) + plots['dataplot'] = ax.plot(X[which_data_rows,free_dims], Y[which_data_rows, d], data_symbol, mew=1.5) #optionally plot some samples if samples: #NOTE not tested with fixed_inputs diff --git a/GPy/util/data_resources.json b/GPy/util/data_resources.json index 51070650..6cc692e8 100644 --- a/GPy/util/data_resources.json +++ b/GPy/util/data_resources.json @@ -150,6 +150,26 @@ ] }, "fruitfly_tomancak": { + "citation": "", + "details": "", + "files": [ + [ + "tomancak_exprs.csv", + "tomancak_se.csv", + "tomancak_prctile5.csv", + "tomancak_prctile25.csv", + "tomancak_prctile50.csv", + "tomancak_prctile75.csv", + "tomancak_prctile95.csv" + ] + ], + "license": null, + "size": 59000000, + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/fruitfly_tomancak/" + ] + }, + "fruitfly_tomancak_cel_files": { "citation": "'Systematic determination of patterns of gene expression during Drosophila embryogenesis' Pavel Tomancak, Amy Beaton, Richard Weiszmann, Elaine Kwan, ShengQiang Shu, Suzanna E Lewis, Stephen Richards, Michael Ashburner, Volker Hartenstein, Susan E Celniker, and Gerald M Rubin", "details": "Gene expression results from blastoderm development in Drosophila Melanogaster.", "files": [ @@ -198,7 +218,7 @@ ] ], "license": null, - "size": 1, + "size": 389000000, "urls": [ "ftp://ftp.fruitfly.org/pub/embryo_tc_array_data/" ] @@ -217,6 +237,7 @@ "http://www.google.com/trends/" ] }, + "hapmap3": { "citation": "Gibbs, Richard A., et al. 'The international HapMap project.' Nature 426.6968 (2003): 789-796.", "details": "HapMap Project: Single Nucleotide Polymorphism sequenced in all human populations. \n The HapMap phase three SNP dataset - 1184 samples out of 11 populations.\n See http://www.nature.com/nature/journal/v426/n6968/abs/nature02168.html for details.\n\n SNP_matrix (A) encoding [see Paschou et all. 2007 (PCA-Correlated SNPs...)]:\n Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then\n\n / 1, iff SNPij==(B1,B1)\n Aij = | 0, iff SNPij==(B1,B2)\n \\\\ -1, iff SNPij==(B2,B2)\n\n The SNP data and the meta information (such as iid, sex and phenotype) are\n stored in the dataframe datadf, index is the Individual ID, \n with following columns for metainfo:\n\n * family_id -> Family ID\n * paternal_id -> Paternal ID\n * maternal_id -> Maternal ID\n * sex -> Sex (1=male; 2=female; other=unknown)\n * phenotype -> Phenotype (-9, or 0 for unknown)\n * population -> Population string (e.g. 'ASW' - 'YRI')\n * rest are SNP rs (ids)\n\n More information is given in infodf:\n\n * Chromosome:\n - autosomal chromosemes -> 1-22\n - X X chromosome -> 23\n - Y Y chromosome -> 24\n - XY Pseudo-autosomal region of X -> 25\n - MT Mitochondrial -> 26\n * Relative Positon (to Chromosome) [base pairs]\n\n ", @@ -434,7 +455,7 @@ }, "singlecell": { "citation": "Guoji Guo, Mikael Huss, Guo Qing Tong, Chaoyang Wang, Li Li Sun, Neil D. Clarke, Paul Robson, Resolution of Cell Fate Decisions Revealed by Single-Cell Gene Expression Analysis from Zygote to Blastocyst, Developmental Cell, Volume 18, Issue 4, 20 April 2010, Pages 675-685, ISSN 1534-5807, http://dx.doi.org/10.1016/j.devcel.2010.02.012. (http://www.sciencedirect.com/science/article/pii/S1534580710001103) Keywords: DEVBIO", - "details": "qPCR Singlecell experiment in Mouse, measuring 48 gene expressions in 1-64 cell states. The labels have been created as in Guo et al. [2010]", + "details": "qPCR TaqMan array single cell experiment in mouse. The data is taken from the early stages of development when the Blastocyst is forming. At the 32 cell stage the data is already separated into the trophectoderm (TE) which goes onto form the placenta and the inner cellular mass (ICM). The ICM further differentiates into the epiblast (EPI)---which gives rise to the endoderm, mesoderm and ectoderm---and the primitive endoderm (PE) which develops into the amniotic sack. Guo et al selected 48 genes for expression measurement. They labelled the resulting cells and their labels are included as an aide to visualization.", "files": [ [ "singlecell.csv" @@ -443,7 +464,7 @@ "license": "ScienceDirect: http://www.elsevier.com/locate/termsandconditions?utm_source=sciencedirect&utm_medium=link&utm_campaign=terms", "size": 233.1, "urls": [ - "http://staffwww.dcs.sheffield.ac.uk/people/M.Zwiessele/data/singlecell/" + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/singlecell/" ] }, "swiss_roll": { diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py index bdd55066..c18431ef 100644 --- a/GPy/util/datasets.py +++ b/GPy/util/datasets.py @@ -112,7 +112,7 @@ def download_url(url, store_directory, save_name = None, messages = True, suffix if content_length_str: file_size = int(content_length_str[0]) else: - file_size = 1e10 + file_size = None status = "" file_size_dl = 0 block_sz = 8192 @@ -124,9 +124,15 @@ def download_url(url, store_directory, save_name = None, messages = True, suffix file_size_dl += len(buff) f.write(buff) sys.stdout.write(" "*(len(status)) + "\r") - status = r"[{perc: <{ll}}] {dl:7.3f}/{full:.3f}MB".format(dl=file_size_dl/(1.*1e6), - full=file_size/(1.*1e6), ll=line_length, + if file_size: + status = r"[{perc: <{ll}}] {dl:7.3f}/{full:.3f}MB".format(dl=file_size_dl/(1048576.), + full=file_size/(1048576.), ll=line_length, perc="="*int(line_length*float(file_size_dl)/file_size)) + else: + status = r"[{perc: <{ll}}] {dl:7.3f}MB".format(dl=file_size_dl/(1048576.), + ll=line_length, + perc="."*int(line_length*float(file_size_dl/(10*1048576.)))) + sys.stdout.write(status) sys.stdout.flush() sys.stdout.write(" "*(len(status)) + "\r") @@ -357,8 +363,15 @@ def football_data(season='1314', data_set='football_data'): def fruitfly_tomancak(data_set='fruitfly_tomancak', gene_number=None): if not data_available(data_set): download_data(data_set) - X = None - Y = None + from pandas import read_csv + filename = os.path.join(data_path, 'tomancak_expr.csv') + Y = read_csv(filename, header=0, index_col=0).T + num_repeats = 3 + num_time = 12 + xt = np.linspace(0, num_time-1, num_time) + xr = np.linspace(0, num_repeats-1, num_repeats) + xtime, xrepeat = np.meshgrid(xt, xr) + X = np.vstack((xtime.flatten(), xrepeat.flatten())).T return data_details_return({'X': X, 'Y': Y, 'gene_number' : gene_number}, data_set) # This will be for downloading google trends data. @@ -732,13 +745,16 @@ def hapmap3(data_set='hapmap3'): def singlecell(data_set='singlecell'): if not data_available(data_set): download_data(data_set) + + from pandas import read_csv dirpath = os.path.join(data_path, data_set) - data = np.loadtxt(os.path.join(dirpath, 'singlecell.csv'), delimiter=",", dtype=str) - genes = data[0, 1:] - labels = data[1:, 0] - Y = np.array(data[1:, 1:], dtype=float) - return data_details_return({'Y': Y, 'info' : "qPCR Singlecell experiment in Mouse, measuring 48 gene expressions in 1-64 cell states. The labels have been created as in Guo et al. [2010]", - 'genes':genes, 'labels':labels, + filename = os.path.join(dirpath, 'singlecell.csv') + Y = read_csv(filename, header=0, index_col=0) + genes = Y.columns + labels = Y.index + # data = np.loadtxt(os.path.join(dirpath, 'singlecell.csv'), delimiter=",", dtype=str) + return data_details_return({'Y': Y, 'info' : "qPCR singlecell experiment in Mouse, measuring 48 gene expressions in 1-64 cell states. The labels have been created as in Guo et al. [2010]", + 'genes': genes, 'labels':labels, }, data_set) def swiss_roll_1000(): diff --git a/GPy/util/misc.py b/GPy/util/misc.py index dc327324..fa9bb24c 100644 --- a/GPy/util/misc.py +++ b/GPy/util/misc.py @@ -130,14 +130,14 @@ def fast_array_equal(A, B): """ % pragma_string if config.getboolean('parallel', 'openmp'): - pragma_string = '#include ' + header_string = '#include ' else: - pragma_string = '' + header_string = '' support_code = """ %s #include - """ % pragma_string + """ % header_string weave_options_openmp = {'headers' : [''], From 8ff4a42d1a392906390edde8a6f7169a595cb7d1 Mon Sep 17 00:00:00 2001 From: mzwiessele Date: Tue, 13 May 2014 09:32:58 +0100 Subject: [PATCH 02/11] [param_array] doc --- GPy/core/parameterization/parameter_core.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index 93924678..5113b8d9 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -713,6 +713,10 @@ class Parameterizable(OptimizationHandlable): @property def param_array(self): + """ + Array representing the parameters of this class. + There is only one copy of all parameters in memory, two during optimization. + """ if self._param_array_ is None: self._param_array_ = np.empty(self.size, dtype=np.float64) return self._param_array_ From f110bbd4c8e96908a77b5d3c93635051a83373dd Mon Sep 17 00:00:00 2001 From: mzwiessele Date: Tue, 13 May 2014 09:33:35 +0100 Subject: [PATCH 03/11] [bgplvm] init lengthscale as 0./var --- GPy/models/bayesian_gplvm.py | 2 +- GPy/util/initialization.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py index 03cd361c..2bcbe0b2 100644 --- a/GPy/models/bayesian_gplvm.py +++ b/GPy/models/bayesian_gplvm.py @@ -42,7 +42,7 @@ class BayesianGPLVM(SparseGP): assert Z.shape[1] == X.shape[1] if kernel is None: - kernel = kern.RBF(input_dim, lengthscale=fracs, ARD=True) # + kern.white(input_dim) + kernel = kern.RBF(input_dim, lengthscale=1./fracs, ARD=True) # + kern.white(input_dim) if likelihood is None: likelihood = Gaussian() diff --git a/GPy/util/initialization.py b/GPy/util/initialization.py index 8d23b541..dd3b6ec7 100644 --- a/GPy/util/initialization.py +++ b/GPy/util/initialization.py @@ -13,11 +13,11 @@ def initialize_latent(init, input_dim, Y): p = pca(Y) PC = p.project(Y, min(input_dim, Y.shape[1])) Xr[:PC.shape[0], :PC.shape[1]] = PC - vars = p.fracs[:input_dim] + var = p.fracs[:input_dim] else: - vars = Xr.var(0) + var = Xr.var(0) Xr -= Xr.mean(0) Xr /= Xr.var(0) - return Xr, vars/vars.max() \ No newline at end of file + return Xr, var/var.max() \ No newline at end of file From db644408ea74858a8f23bd79a10d48b1b46dc39d Mon Sep 17 00:00:00 2001 From: Neil Lawrence Date: Tue, 13 May 2014 12:17:42 +0100 Subject: [PATCH 04/11] Add ordinal and attempt to fix downloads --- GPy/gpy_config.cfg | 4 ++ GPy/likelihoods/ordinal.py | 48 +++++++++++++++++++ .../matplot_dep/dim_reduction_plots.py | 2 +- GPy/util/data_resources.json | 15 ++++++ GPy/util/datasets.py | 21 +++++++- 5 files changed, 87 insertions(+), 3 deletions(-) create mode 100644 GPy/likelihoods/ordinal.py diff --git a/GPy/gpy_config.cfg b/GPy/gpy_config.cfg index db90dbf6..43cd0ebe 100644 --- a/GPy/gpy_config.cfg +++ b/GPy/gpy_config.cfg @@ -6,6 +6,10 @@ # some platforms, hence this option. openmp=False +[datasets] +# location for the local data cache +dir=$HOME/tmp/GPy-datasets/ + [anaconda] # if you have an anaconda python installation please specify it here. installed = False diff --git a/GPy/likelihoods/ordinal.py b/GPy/likelihoods/ordinal.py new file mode 100644 index 00000000..4ac204fd --- /dev/null +++ b/GPy/likelihoods/ordinal.py @@ -0,0 +1,48 @@ +# Copyright (c) 2014 The GPy authors (see AUTHORS.txt) +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import sympy as sym +from GPy.util.symbolic import gammaln, normcdfln, normcdf, IndMatrix, create_matrix +import numpy as np +from ..util.univariate_Gaussian import std_norm_pdf, std_norm_cdf +import link_functions +from symbolic import Symbolic +from scipy import stats + +class Ordinal(Symbolic): + """ + Ordinal + + .. math:: + p(y_{i}|\pi(f_{i})) = \left(\frac{r}{r+f_i}\right)^r \frac{\Gamma(r+y_i)}{y!\Gamma(r)}\left(\frac{f_i}{r+f_i}\right)^{y_i} + + .. Note:: + Y takes non zero integer values.. + link function should have a positive domain, e.g. log (default). + + .. See also:: + symbolic.py, for the parent class + """ + def __init__(self, categories=3, gp_link=None): + if gp_link is None: + gp_link = link_functions.Identity() + + dispersion = sym.Symbol('width', positive=True, real=True) + y_0 = sym.Symbol('y_0', nonnegative=True, integer=True) + f_0 = sym.Symbol('f_0', positive=True, real=True) + log_pdf = create_matrix('log_pdf', 1, categories) + log_pdf[0] = normcdfln(-f_0) + if categories>2: + w = create_matrix('w', 1, categories) + log_pdf[categories-1] = normcdfln(w.sum() + f_0) + for i in range(1, categories-1): + log_pdf[i] = sym.log(normcdf(w[0, 0:i-1].sum() + f_0) - normcdf(w[0, 0:i].sum()-f_0) ) + else: + log_pdf[1] = normcdfln(f_0) + log_pdf.index_var = y_0 + super(Ordinal, self).__init__(log_pdf=log_pdf, gp_link=gp_link, name='Ordinal') + + # TODO: Check this. + self.log_concave = True + diff --git a/GPy/plotting/matplot_dep/dim_reduction_plots.py b/GPy/plotting/matplot_dep/dim_reduction_plots.py index 71e08c6b..f8413671 100644 --- a/GPy/plotting/matplot_dep/dim_reduction_plots.py +++ b/GPy/plotting/matplot_dep/dim_reduction_plots.py @@ -97,7 +97,7 @@ def plot_latent(model, labels=None, which_indices=None, elif type(ul) is np.int64: this_label = 'class %i' % ul else: - this_label = unicode(i) + this_label = unicode(ul) m = marker.next() index = np.nonzero(labels == ul)[0] diff --git a/GPy/util/data_resources.json b/GPy/util/data_resources.json index 6cc692e8..d6640295 100644 --- a/GPy/util/data_resources.json +++ b/GPy/util/data_resources.json @@ -467,6 +467,21 @@ "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/singlecell/" ] }, + "sod1_mouse": { + "citation": "Transcriptomic indices of fast and slow disease progression in two mouse models of amyotrophic lateral sclerosis' Nardo G1, Iennaco R, Fusi N, Heath PR, Marino M, Trolese MC, Ferraiuolo L, Lawrence N, Shaw PJ, Bendotti C Brain. 2013 Nov;136(Pt 11):3305-32. doi: 10.1093/brain/awt250. Epub 2013 Sep 24.", + "details": "Gene expression data from two separate strains of mice: C57 and 129Sv in wild type and SOD1 mutant strains.", + "files": [ + [ + "sod1_C59_129_exprs.csv", + "sod1_C59_129_se.csv" + ] + ], + "license": null, + "size": 0, + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/singlecell/sod1_mouse/" + ] + }, "swiss_roll": { "citation": "A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", "details": "Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py index c18431ef..05e4013e 100644 --- a/GPy/util/datasets.py +++ b/GPy/util/datasets.py @@ -12,6 +12,8 @@ import datetime import json import re +from config import * + ipython_available=True try: import IPython @@ -29,7 +31,8 @@ def reporthook(a,b,c): sys.stdout.flush() # Global variables -data_path = os.path.join(os.path.dirname(__file__), 'datasets') +data_path = os.path.expandvar(config.get('datasets', 'dir')) +#data_path = os.path.join(os.path.dirname(__file__), 'datasets') default_seed = 10000 overide_manual_authorize=False neil_url = 'http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/' @@ -360,11 +363,25 @@ def football_data(season='1314', data_set='football_data'): Y = table[:, 4:] return data_details_return({'X': X, 'Y': Y}, data_set) +def sod1_mouse(data_set='sod1_mouse'): + if not data_available(data_set): + download_data(data_set) + from pandas import read_csv + dirpath = os.path.join(data_path, data_set) + filename = os.path.join(dirpath, 'sod1_C57_129_exprs.csv') + Y = read_csv(filename, header=0, index_col=0).T + num_repeats=4 + num_time=4 + num_cond=4 + X = 1 + return data_details_return({'X': X, 'Y': Y}, data_set) + def fruitfly_tomancak(data_set='fruitfly_tomancak', gene_number=None): if not data_available(data_set): download_data(data_set) from pandas import read_csv - filename = os.path.join(data_path, 'tomancak_expr.csv') + dirpath = os.path.join(data_path, data_set) + filename = os.path.join(dirpath, 'tomancak_expr.csv') Y = read_csv(filename, header=0, index_col=0).T num_repeats = 3 num_time = 12 From 851e63476ce7ca186c97bb1e6a68ed420f35f715 Mon Sep 17 00:00:00 2001 From: mzwiessele Date: Tue, 13 May 2014 12:53:42 +0100 Subject: [PATCH 05/11] [pydot] build pydot with new observer list --- GPy/core/parameterization/lists_and_dicts.py | 1 + GPy/core/parameterization/param.py | 4 ++-- GPy/core/parameterization/parameterized.py | 8 ++++---- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/GPy/core/parameterization/lists_and_dicts.py b/GPy/core/parameterization/lists_and_dicts.py index 13547c94..64bdb077 100644 --- a/GPy/core/parameterization/lists_and_dicts.py +++ b/GPy/core/parameterization/lists_and_dicts.py @@ -57,6 +57,7 @@ class ObservablesList(object): def __repr__(self): return self._poc.__repr__() + def add(self, priority, observable, callble): if observable is not None: diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index 19e48d84..91bf3561 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -57,9 +57,9 @@ class Param(OptimizationHandlable, ObsAr): def build_pydot(self,G): import pydot - node = pydot.Node(id(self), shape='record', label=self.name) + node = pydot.Node(id(self), shape='trapezium', label=self.name)#, fontcolor='white', color='white') G.add_node(node) - for o in self.observers.keys(): + for _, o, _ in self.observers: label = o.name if hasattr(o, 'name') else str(o) observed_node = pydot.Node(id(o), label=label) G.add_node(observed_node) diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py index 738f0485..67694a1b 100644 --- a/GPy/core/parameterization/parameterized.py +++ b/GPy/core/parameterization/parameterized.py @@ -82,15 +82,15 @@ class Parameterized(Parameterizable): import pydot # @UnresolvedImport iamroot = False if G is None: - G = pydot.Dot(graph_type='digraph') + G = pydot.Dot(graph_type='digraph', bgcolor=None) iamroot=True - node = pydot.Node(id(self), shape='record', label=self.name) + node = pydot.Node(id(self), shape='box', label=self.name)#, color='white') G.add_node(node) for child in self._parameters_: child_node = child.build_pydot(G) - G.add_edge(pydot.Edge(node, child_node)) + G.add_edge(pydot.Edge(node, child_node))#, color='white')) - for o in self.observers.keys(): + for _, o, _ in self.observers: label = o.name if hasattr(o, 'name') else str(o) observed_node = pydot.Node(id(o), label=label) G.add_node(observed_node) From 442bc3f58199678e26e03b37a865ea1bb3975880 Mon Sep 17 00:00:00 2001 From: mzwiessele Date: Tue, 13 May 2014 14:20:59 +0100 Subject: [PATCH 06/11] [paramcore] fix for traversal --- GPy/core/parameterization/parameter_core.py | 22 ++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index 5113b8d9..ba85be03 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -599,14 +599,13 @@ class OptimizationHandlable(Constrainable): return p def _set_params_transformed(self, p): - if p is self.param_array: - p = p.copy() - if self.has_parent() and self.constraints[__fixed__].size != 0: - fixes = np.ones(self.size).astype(bool) - fixes[self.constraints[__fixed__]] = FIXED - self.param_array.flat[fixes] = p - elif self._has_fixes(): self.param_array.flat[self._fixes_] = p - else: self.param_array.flat = p + if not(p is self.param_array): + if self.has_parent() and self.constraints[__fixed__].size != 0: + fixes = np.ones(self.size).astype(bool) + fixes[self.constraints[__fixed__]] = FIXED + self.param_array.flat[fixes] = p + elif self._has_fixes(): self.param_array.flat[self._fixes_] = p + else: self.param_array.flat = p self.untransform() self._trigger_params_changed() @@ -621,7 +620,7 @@ class OptimizationHandlable(Constrainable): def num_params(self): """ Return the number of parameters of this parameter_handle. - Param objects will allways return 0. + Param objects will always return 0. """ raise NotImplemented, "Abstract, please implement in respective classes" @@ -742,14 +741,15 @@ class Parameterizable(OptimizationHandlable): self.__visited = True for c in self._parameters_: c.traverse(visit, *args, **kwargs) + self.__visited = False def traverse_parents(self, visit, *args, **kwargs): """ Traverse the hierarchy upwards, visiting all parents and their children. See "visitor pattern" in literature. This is implemented in pre-order fashion. - + Example: - + parents = [] self.traverse_parents(parents.append) print parents From 2953e6b73b02c98db81c0ea0d7521bb564c9029e Mon Sep 17 00:00:00 2001 From: Neil Lawrence Date: Tue, 13 May 2014 17:00:02 +0100 Subject: [PATCH 07/11] Add ordinal and attempt to fix downloads --- GPy/util/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py index 05e4013e..9f8e1938 100644 --- a/GPy/util/datasets.py +++ b/GPy/util/datasets.py @@ -31,7 +31,7 @@ def reporthook(a,b,c): sys.stdout.flush() # Global variables -data_path = os.path.expandvar(config.get('datasets', 'dir')) +data_path = os.path.expandvars(config.get('datasets', 'dir')) #data_path = os.path.join(os.path.dirname(__file__), 'datasets') default_seed = 10000 overide_manual_authorize=False From afc74b02cec98a8045e3a9ba864598923c2046c7 Mon Sep 17 00:00:00 2001 From: Neil Lawrence Date: Tue, 13 May 2014 17:02:24 +0100 Subject: [PATCH 08/11] Sod1 Download --- GPy/util/data_resources.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPy/util/data_resources.json b/GPy/util/data_resources.json index d6640295..58c2157c 100644 --- a/GPy/util/data_resources.json +++ b/GPy/util/data_resources.json @@ -479,7 +479,7 @@ "license": null, "size": 0, "urls": [ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/singlecell/sod1_mouse/" + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/sod1_mouse/" ] }, "swiss_roll": { From 62920c2811cbce58f9935cf9e900cb07d472d7c8 Mon Sep 17 00:00:00 2001 From: Neil Lawrence Date: Tue, 13 May 2014 17:08:51 +0100 Subject: [PATCH 09/11] Made openmp switch in only dependent on potion in rbf.py and linear.py --- GPy/util/data_resources.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPy/util/data_resources.json b/GPy/util/data_resources.json index 58c2157c..61050f9d 100644 --- a/GPy/util/data_resources.json +++ b/GPy/util/data_resources.json @@ -472,8 +472,8 @@ "details": "Gene expression data from two separate strains of mice: C57 and 129Sv in wild type and SOD1 mutant strains.", "files": [ [ - "sod1_C59_129_exprs.csv", - "sod1_C59_129_se.csv" + "sod1_C57_129_exprs.csv", + "sod1_C57_129_se.csv" ] ], "license": null, From cff37293d9e0d1fdd6a655ff5e64e84425fb0d28 Mon Sep 17 00:00:00 2001 From: Neil Lawrence Date: Tue, 13 May 2014 17:20:57 +0100 Subject: [PATCH 10/11] Fixing fruitfly_tomancak data load. --- GPy/util/datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py index 9f8e1938..81a6fabd 100644 --- a/GPy/util/datasets.py +++ b/GPy/util/datasets.py @@ -369,7 +369,7 @@ def sod1_mouse(data_set='sod1_mouse'): from pandas import read_csv dirpath = os.path.join(data_path, data_set) filename = os.path.join(dirpath, 'sod1_C57_129_exprs.csv') - Y = read_csv(filename, header=0, index_col=0).T + Y = read_csv(filename, header=0, index_col=0) num_repeats=4 num_time=4 num_cond=4 @@ -381,7 +381,7 @@ def fruitfly_tomancak(data_set='fruitfly_tomancak', gene_number=None): download_data(data_set) from pandas import read_csv dirpath = os.path.join(data_path, data_set) - filename = os.path.join(dirpath, 'tomancak_expr.csv') + filename = os.path.join(dirpath, 'tomancak_exprs.csv') Y = read_csv(filename, header=0, index_col=0).T num_repeats = 3 num_time = 12 From 8d6eed60108fdfc668ea8b9d47a6545392767e27 Mon Sep 17 00:00:00 2001 From: mzwiessele Date: Wed, 14 May 2014 08:53:56 +0100 Subject: [PATCH 11/11] [param] hierarchy traversal easier now --- GPy/core/parameterization/lists_and_dicts.py | 8 ++- GPy/core/parameterization/param.py | 23 +++++++- GPy/core/parameterization/parameter_core.py | 56 ++++++++++---------- GPy/testing/pickle_tests.py | 3 ++ 4 files changed, 57 insertions(+), 33 deletions(-) diff --git a/GPy/core/parameterization/lists_and_dicts.py b/GPy/core/parameterization/lists_and_dicts.py index 64bdb077..084ab0db 100644 --- a/GPy/core/parameterization/lists_and_dicts.py +++ b/GPy/core/parameterization/lists_and_dicts.py @@ -88,19 +88,17 @@ class ObservablesList(object): def __iter__(self): self.flush() for p, o, c in self._poc: - if o() is not None: - yield p, o(), c + yield p, o(), c def __len__(self): self.flush() return self._poc.__len__() def __deepcopy__(self, memo): - self.flush() s = ObservablesList() - for p,o,c in self._poc: + for p,o,c in self: import copy - s.add(p, copy.deepcopy(o(), memo), copy.deepcopy(c, memo)) + s.add(p, copy.deepcopy(o, memo), copy.deepcopy(c, memo)) s.flush() return s diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index 91bf3561..920072d7 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -169,8 +169,29 @@ class Param(OptimizationHandlable, ObsAr): # parameterizable #=========================================================================== def traverse(self, visit, *args, **kwargs): - visit(self, *args, **kwargs) + """ + Traverse the hierarchy performing visit(self, *args, **kwargs) at every node passed by. + See "visitor pattern" in literature. This is implemented in pre-order fashion. + This will function will just call visit on self, as Param are leaf nodes. + """ + visit(self, *args, **kwargs) + + def traverse_parents(self, visit, *args, **kwargs): + """ + Traverse the hierarchy upwards, visiting all parents and their children, except self. + See "visitor pattern" in literature. This is implemented in pre-order fashion. + + Example: + + parents = [] + self.traverse_parents(parents.append) + print parents + """ + if self.has_parent(): + self.__visited = True + self._parent_._traverse_parents(visit, *args, **kwargs) + self.__visited = False #=========================================================================== # Convenience diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index ba85be03..0a0ad067 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -176,24 +176,23 @@ class Pickleable(object): #raise NotImplementedError, "Copy is not yet implemented, TODO: Observable hierarchy" import copy memo = {} + # the next part makes sure that we do not include parents in any form: parents = [] - self.traverse_parents(parents.append) - # remove self, which is the first arguments - parents = [p for p in parents if p is not self] + self.traverse_parents(parents.append) # collect parents for p in parents: - memo[id(p)] = None - memo[id(self.gradient)] = None - memo[id(self.param_array)] = None - memo[id(self._fixes_)] = None - c = copy.deepcopy(self, memo) + memo[id(p)] = None # set all parents to be None, so they will not be copied + memo[id(self.gradient)] = None # reset the gradient + memo[id(self.param_array)] = None # and param_array + memo[id(self._fixes_)] = None # fixes have to be reset, as this is now highest parent + c = copy.deepcopy(self, memo) # and start the copy c._parent_index_ = None return c def __deepcopy__(self, memo): - s = self.__new__(self.__class__) - memo[id(self)] = s + s = self.__new__(self.__class__) # fresh instance + memo[id(self)] = s # be sure to break all cycles --> self is already done import copy - s.__dict__.update(copy.deepcopy(self.__dict__, memo)) + s.__dict__.update(copy.deepcopy(self.__dict__, memo)) # standard copy return s def __getstate__(self): @@ -580,12 +579,6 @@ class OptimizationHandlable(Constrainable): def __init__(self, name, default_constraint=None, *a, **kw): super(OptimizationHandlable, self).__init__(name, default_constraint=default_constraint, *a, **kw) - def transform(self): - [np.put(self.param_array, ind, c.finv(self.param_array.flat[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__] - - def untransform(self): - [np.put(self.param_array, ind, c.f(self.param_array.flat[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__] - def _get_params_transformed(self): # transformed parameters (apply transformation rules) p = self.param_array.copy() @@ -606,7 +599,8 @@ class OptimizationHandlable(Constrainable): self.param_array.flat[fixes] = p elif self._has_fixes(): self.param_array.flat[self._fixes_] = p else: self.param_array.flat = p - self.untransform() + [np.put(self.param_array, ind, c.f(self.param_array.flat[ind])) + for c, ind in self.constraints.iteritems() if c != __fixed__] self._trigger_params_changed() def _trigger_params_changed(self, trigger_parent=True): @@ -726,7 +720,9 @@ class Parameterizable(OptimizationHandlable): def traverse(self, visit, *args, **kwargs): """ - Traverse the hierarchy performing visit(self, *args, **kwargs) at every node passed by. + Traverse the hierarchy performing visit(self, *args, **kwargs) + at every node passed by downwards. This function includes self! + See "visitor pattern" in literature. This is implemented in pre-order fashion. Example: @@ -745,7 +741,7 @@ class Parameterizable(OptimizationHandlable): def traverse_parents(self, visit, *args, **kwargs): """ - Traverse the hierarchy upwards, visiting all parents and their children. + Traverse the hierarchy upwards, visiting all parents and their children except self. See "visitor pattern" in literature. This is implemented in pre-order fashion. Example: @@ -754,13 +750,20 @@ class Parameterizable(OptimizationHandlable): self.traverse_parents(parents.append) print parents """ - if not self.__visited: - visit(self, *args, **kwargs) + if self.has_parent(): self.__visited = True + self._parent_._traverse_parents(visit, *args, **kwargs) + self.__visited = False + + def _traverse_parents(self, visit, *args, **kwargs): + if not self.__visited: + self.__visited = True + visit(self, *args, **kwargs) if self.has_parent(): - self._parent_.traverse_parents(visit, *args, **kwargs) + self._parent_._traverse_parents(visit, *args, **kwargs) self._parent_.traverse(visit, *args, **kwargs) self.__visited = False + #========================================================================= # Gradient handling #========================================================================= @@ -827,11 +830,10 @@ class Parameterizable(OptimizationHandlable): # raise HierarchyError, "parameter {} already in another model ({}), create new object (or copy) for adding".format(param._short(), param._highest_parent_._short()) elif param not in self._parameters_: if param.has_parent(): - parent = param._parent_ - while parent is not None: + def visit(parent, self): if parent is self: raise HierarchyError, "You cannot add a parameter twice into the hierarchy" - parent = parent._parent_ + param.traverse_parents(visit, self) param._parent_.remove_parameter(param) # make sure the size is set if index is None: @@ -875,7 +877,7 @@ class Parameterizable(OptimizationHandlable): :param param: param object to remove from being a parameter of this parameterized object. """ if not param in self._parameters_: - raise RuntimeError, "Parameter {} does not belong to this object, remove parameters directly from their respective parents".format(param._short()) + raise RuntimeError, "Parameter {} does not belong to this object {}, remove parameters directly from their respective parents".format(param._short(), self.name) start = sum([p.size for p in self._parameters_[:param._parent_index_]]) self._remove_parameter_name(param) diff --git a/GPy/testing/pickle_tests.py b/GPy/testing/pickle_tests.py index 37dd6e0b..b62f5e45 100644 --- a/GPy/testing/pickle_tests.py +++ b/GPy/testing/pickle_tests.py @@ -132,6 +132,9 @@ class Test(ListDictTestCase): self.assertIsNot(par.full_gradient, pcopy.full_gradient) self.assertTrue(pcopy.checkgrad()) self.assert_(np.any(pcopy.gradient!=0.0)) + pcopy.optimize('bfgs') + par.optimize('bfgs') + np.testing.assert_allclose(pcopy.param_array, par.param_array, atol=.001) with tempfile.TemporaryFile('w+b') as f: par.pickle(f) f.seek(0)