From 5405dbd1bc032e7786d025bf2749310a6378a765 Mon Sep 17 00:00:00 2001 From: Neil Lawrence Date: Tue, 29 Apr 2014 15:43:35 +0100 Subject: [PATCH 01/43] Pre-devel-move check in --- GPy/core/symbolic.py | 16 ++++++++-------- GPy/likelihoods/__init__.py | 30 +++++++++++++++--------------- GPy/util/symbolic.py | 11 +++++++++-- 3 files changed, 32 insertions(+), 25 deletions(-) diff --git a/GPy/core/symbolic.py b/GPy/core/symbolic.py index be1234c0..a2d61911 100644 --- a/GPy/core/symbolic.py +++ b/GPy/core/symbolic.py @@ -107,14 +107,14 @@ class Symbolic_core(): # Do symbolic work to compute derivatives. for key, func in self.expressions.items(): - if func['function'].is_Matrix: - rows = func['function'].shape[0] - cols = func['function'].shape[1] - self.expressions[key]['derivative'] = sym.zeros(rows, cols) - for i in xrange(rows): - for j in xrange(cols): - self.expressions[key]['derivative'][i, j] = extract_derivative(func['function'][i, j], derivative_arguments) - else: + # if func['function'].is_Matrix: + # rows = func['function'].shape[0] + # cols = func['function'].shape[1] + # self.expressions[key]['derivative'] = sym.zeros(rows, cols) + # for i in xrange(rows): + # for j in xrange(cols): + # self.expressions[key]['derivative'][i, j] = extract_derivative(func['function'][i, j], derivative_arguments) + # else: self.expressions[key]['derivative'] = extract_derivative(func['function'], derivative_arguments) def _set_parameters(self, parameters): diff --git a/GPy/likelihoods/__init__.py b/GPy/likelihoods/__init__.py index acf0f72c..369271a1 100644 --- a/GPy/likelihoods/__init__.py +++ b/GPy/likelihoods/__init__.py @@ -6,18 +6,18 @@ from poisson import Poisson from student_t import StudentT from likelihood import Likelihood from mixed_noise import MixedNoise -# TODO need to fix this in a config file. -# TODO need to add the files to the git repo! -#try: - #import sympy as sym - #sympy_available=True -#except ImportError: - #sympy_available=False -#if sympy_available: - ## These are likelihoods that rely on symbolic. - #from symbolic import Symbolic - #from sstudent_t import SstudentT - #from negative_binomial import Negative_binomial - ##from skew_normal import Skew_normal - #from skew_exponential import Skew_exponential - #from null_category import Null_category +#TODO need to fix this in a config file. +#TODO need to add the files to the git repo! +try: + import sympy as sym + sympy_available=True +except ImportError: + sympy_available=False +if sympy_available: + #These are likelihoods that rely on symbolic. + from symbolic import Symbolic + from sstudent_t import SstudentT + from negative_binomial import Negative_binomial + from skew_normal import Skew_normal + from skew_exponential import Skew_exponential +# from null_category import Null_category diff --git a/GPy/util/symbolic.py b/GPy/util/symbolic.py index 40a3f91b..9112c5c0 100644 --- a/GPy/util/symbolic.py +++ b/GPy/util/symbolic.py @@ -2,15 +2,22 @@ import sys import numpy as np import sympy as sym from sympy import Function, S, oo, I, cos, sin, asin, log, erf, pi, exp, sqrt, sign, gamma, polygamma - +from sympy.matrices import Matrix ######################################## ## Try to do some matrix functions: problem, you can't do derivatives ## with respect to matrix functions :-( +class GPySymMatrix(Matrix): + def __init__(self, indices): + Matrix.__init__(self) + def atoms(self): + return [e2 for e in self for e2 in e.atoms()] + class selector(Function): """A function that returns an element of a Matrix depending on input indices.""" nargs = 3 - + def fdiff(self, argindex=1): + return selector(*self.args) @classmethod def eval(cls, X, i, j): if i.is_Number and j.is_Number: From f07bdad24e1715b2ac9a912d34471eedb997c0aa Mon Sep 17 00:00:00 2001 From: James Hensman Date: Wed, 30 Apr 2014 12:11:41 +0100 Subject: [PATCH 02/43] added the ability for GPs to predict with a different kernel --- GPy/core/gp.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/GPy/core/gp.py b/GPy/core/gp.py index 692e5d01..62e16de1 100644 --- a/GPy/core/gp.py +++ b/GPy/core/gp.py @@ -72,7 +72,7 @@ class GP(Model): def log_likelihood(self): return self._log_marginal_likelihood - def _raw_predict(self, _Xnew, full_cov=False): + def _raw_predict(self, _Xnew, full_cov=False, kern=None): """ For making predictions, does not account for normalization or likelihood @@ -87,14 +87,17 @@ class GP(Model): $$ """ - Kx = self.kern.K(_Xnew, self.X).T + if kern is None: + kern = self.kern + + Kx = kern.K(_Xnew, self.X).T WiKx = np.dot(self.posterior.woodbury_inv, Kx) mu = np.dot(Kx.T, self.posterior.woodbury_vector) if full_cov: - Kxx = self.kern.K(_Xnew) + Kxx = kern.K(_Xnew) var = Kxx - np.dot(Kx.T, WiKx) else: - Kxx = self.kern.Kdiag(_Xnew) + Kxx = kern.Kdiag(_Xnew) var = Kxx - np.sum(WiKx*Kx, 0) var = var.reshape(-1, 1) @@ -102,7 +105,7 @@ class GP(Model): if len(mu.shape)==1: mu = mu[:,None] return mu, var - def predict(self, Xnew, full_cov=False, Y_metadata=None): + def predict(self, Xnew, full_cov=False, Y_metadata=None, kern=None): """ Predict the function(s) at the new point(s) Xnew. @@ -111,6 +114,9 @@ class GP(Model): :param full_cov: whether to return the full covariance matrix, or just the diagonal :type full_cov: bool + :param Y_metadata: metadata about the predicting point to pass to the likelihood + :param kern: The kernel to use for prediction (defaults to the model + kern). this is useful for examining e.g. subprocesses. :returns: mean: posterior mean, a Numpy array, Nnew x self.input_dim :returns: var: posterior variance, a Numpy array, Nnew x 1 if full_cov=False, Nnew x Nnew otherwise @@ -121,9 +127,9 @@ class GP(Model): If full_cov and self.input_dim > 1, the return shape of var is Nnew x Nnew x self.input_dim. If self.input_dim == 1, the return shape is Nnew x Nnew. This is to allow for different normalizations of the output dimensions. - """ + """ #predict the latent function values - mu, var = self._raw_predict(Xnew, full_cov=full_cov) + mu, var = self._raw_predict(Xnew, full_cov=full_cov, kern=kern) # now push through likelihood mean, var = self.likelihood.predictive_values(mu, var, full_cov, Y_metadata) From 02f62dea2d5e2b5674c3aacd7c204cb4afb06f5e Mon Sep 17 00:00:00 2001 From: James Hensman Date: Wed, 30 Apr 2014 13:10:01 +0100 Subject: [PATCH 03/43] removed import of non-added file (Mu) --- GPy/kern/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py index 4ac91d04..1f58d7c2 100644 --- a/GPy/kern/__init__.py +++ b/GPy/kern/__init__.py @@ -10,7 +10,7 @@ from _src.independent_outputs import IndependentOutputs, Hierarchical from _src.coregionalize import Coregionalize from _src.ssrbf import SSRBF # TODO: ZD: did you remove this? from _src.ODE_UY import ODE_UY -from _src.ODE_UYC import ODE_UYC +#from _src.ODE_UYC import ODE_UYC ADD THIS FILE TO THE REPO!! from _src.ODE_st import ODE_st # TODO: put this in an init file somewhere #I'm commenting this out because the files were not added. JH. Remember to add the files before commiting From 9c7c768c597b8e0d219484de6f8b2d9c36e5fea2 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Wed, 30 Apr 2014 13:11:39 +0100 Subject: [PATCH 04/43] removed another import of non-added file --- GPy/kern/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py index 1f58d7c2..ccf73b97 100644 --- a/GPy/kern/__init__.py +++ b/GPy/kern/__init__.py @@ -11,7 +11,7 @@ from _src.coregionalize import Coregionalize from _src.ssrbf import SSRBF # TODO: ZD: did you remove this? from _src.ODE_UY import ODE_UY #from _src.ODE_UYC import ODE_UYC ADD THIS FILE TO THE REPO!! -from _src.ODE_st import ODE_st +#from _src.ODE_st import ODE_st # TODO: put this in an init file somewhere #I'm commenting this out because the files were not added. JH. Remember to add the files before commiting try: From 7d41001ae1579f77cd26112d60c2d5a558134aaa Mon Sep 17 00:00:00 2001 From: James Hensman Date: Thu, 1 May 2014 15:28:02 +0100 Subject: [PATCH 05/43] some hacking on image_show in viaualize --- GPy/plotting/matplot_dep/visualize.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/GPy/plotting/matplot_dep/visualize.py b/GPy/plotting/matplot_dep/visualize.py index 89d36a7d..fae05ff3 100644 --- a/GPy/plotting/matplot_dep/visualize.py +++ b/GPy/plotting/matplot_dep/visualize.py @@ -273,7 +273,7 @@ class image_show(matplotlib_show): :type preset_mean: double :param preset_std: the preset standard deviation of a scaled image. :type preset_std: double""" - def __init__(self, vals, axes=None, dimensions=(16,16), transpose=False, order='C', invert=False, scale=False, palette=[], preset_mean = 0., preset_std = -1., select_image=0): + def __init__(self, vals, axes=None, dimensions=(16,16), transpose=False, order='C', invert=False, scale=False, palette=[], preset_mean=0., preset_std=1., select_image=0): matplotlib_show.__init__(self, vals, axes) self.dimensions = dimensions self.transpose = transpose @@ -323,13 +323,12 @@ class image_show(matplotlib_show): self.vals = -self.vals # un-normalizing, for visualisation purposes: - if self.preset_std >= 0: # The Mean is assumed to be in the range (0,255) - self.vals = self.vals*self.preset_std + self.preset_mean - # Clipping the values: - self.vals[self.vals < 0] = 0 - self.vals[self.vals > 255] = 255 - else: - self.vals = 255*(self.vals - self.vals.min())/(self.vals.max() - self.vals.min()) + self.vals = self.vals*self.preset_std + self.preset_mean + # Clipping the values: + #self.vals[self.vals < 0] = 0 + #self.vals[self.vals > 255] = 255 + #else: + #self.vals = 255*(self.vals - self.vals.min())/(self.vals.max() - self.vals.min()) if not self.palette == []: # applying using an image palette (e.g. if the image has been quantized) from PIL import Image self.vals = Image.fromarray(self.vals.astype('uint8')) From 5fac35b0fda9d0a8c6866f5c06dccd9f76bd279a Mon Sep 17 00:00:00 2001 From: James Hensman Date: Tue, 6 May 2014 13:05:26 +0100 Subject: [PATCH 06/43] sparse GPs can now accept kerns for predicting --- GPy/core/sparse_gp.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/GPy/core/sparse_gp.py b/GPy/core/sparse_gp.py index 7552b8ac..b01d39c0 100644 --- a/GPy/core/sparse_gp.py +++ b/GPy/core/sparse_gp.py @@ -79,29 +79,32 @@ class SparseGP(GP): self.Z.gradient = self.kern.gradients_X(self.grad_dict['dL_dKmm'], self.Z) self.Z.gradient += self.kern.gradients_X(self.grad_dict['dL_dKnm'].T, self.Z, self.X) - def _raw_predict(self, Xnew, full_cov=False): + def _raw_predict(self, Xnew, full_cov=False, kern=None): """ Make a prediction for the latent function values """ + + if kern is None: kern = self.kern + if not isinstance(Xnew, VariationalPosterior): - Kx = self.kern.K(self.Z, Xnew) + Kx = kern.K(self.Z, Xnew) mu = np.dot(Kx.T, self.posterior.woodbury_vector) if full_cov: - Kxx = self.kern.K(Xnew) + Kxx = kern.K(Xnew) var = Kxx - np.dot(Kx.T, np.dot(self.posterior.woodbury_inv, Kx)) #var = Kxx[:,:,None] - np.tensordot(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx).T, Kx, [1,0]).swapaxes(1,2) var = var.squeeze() else: - Kxx = self.kern.Kdiag(Xnew) + Kxx = kern.Kdiag(Xnew) var = (Kxx - np.sum(np.dot(np.atleast_3d(self.posterior.woodbury_inv).T, Kx) * Kx[None,:,:], 1)).T else: - Kx = self.kern.psi1(self.Z, Xnew) + Kx = kern.psi1(self.Z, Xnew) mu = np.dot(Kx, self.posterior.woodbury_vector) if full_cov: raise NotImplementedError, "TODO" else: - Kxx = self.kern.psi0(self.Z, Xnew) - psi2 = self.kern.psi2(self.Z, Xnew) + Kxx = kern.psi0(self.Z, Xnew) + psi2 = kern.psi2(self.Z, Xnew) var = Kxx - np.sum(np.sum(psi2 * Kmmi_LmiBLmi[None, :, :], 1), 1) return mu, var From adf6dd8ec661fe74cde68d97dfdb178e85a01c90 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Wed, 7 May 2014 11:03:49 +0100 Subject: [PATCH 07/43] added citation to readme --- README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.md b/README.md index b7635b0d..2c33f0d2 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,16 @@ A Gaussian processes framework in Python. Continuous integration status: ![CI status](https://travis-ci.org/SheffieldML/GPy.png) +Citation +======== + + @Misc{gpy2014, + author = {The GPy authors}, + title = {{GPy}: A Gaussian process framework in python}, + howpublished = {\url{http://github.com/SheffieldML/GPy}}, + year = {2012--2014} + } + Getting started =============== Installing with pip From beebf6933a3681c573ffd8e8b7f978e549e2c959 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Wed, 7 May 2014 14:53:10 +0100 Subject: [PATCH 08/43] added polynomial kernel --- GPy/kern/__init__.py | 1 + GPy/kern/_src/poly.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 GPy/kern/_src/poly.py diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py index ccf73b97..1ed5e805 100644 --- a/GPy/kern/__init__.py +++ b/GPy/kern/__init__.py @@ -10,6 +10,7 @@ from _src.independent_outputs import IndependentOutputs, Hierarchical from _src.coregionalize import Coregionalize from _src.ssrbf import SSRBF # TODO: ZD: did you remove this? from _src.ODE_UY import ODE_UY +from _src.poly import Poly #from _src.ODE_UYC import ODE_UYC ADD THIS FILE TO THE REPO!! #from _src.ODE_st import ODE_st # TODO: put this in an init file somewhere diff --git a/GPy/kern/_src/poly.py b/GPy/kern/_src/poly.py new file mode 100644 index 00000000..d40f805c --- /dev/null +++ b/GPy/kern/_src/poly.py @@ -0,0 +1,42 @@ +# Copyright (c) 2014, James Hensman +# Licensed under the BSD 3-clause license (see LICENSE.txt) + +import numpy as np +from kern import Kern +from ...util.misc import param_to_array +from ...core.parameterization import Param +from ...core.parameterization.transformations import Logexp +class Poly(Kern): + """ + Polynomial kernel + """ + + def __init__(self, input_dim, variance=1., order=3., active_dims=None, name='poly'): + super(Poly, self).__init__(input_dim, active_dims, name) + self.variance = Param('variance', variance, Logexp()) + self.add_parameter(self.variance) + self.order=order + + def K(self, X, X2=None): + return (self._dot_product(X, X2) + 1.)**self.order * self.variance + + def _dot_product(self, X, X2=None): + if X2 is None: + return np.dot(X, X.T) + else: + return np.dot(X, X2.T) + + def Kdiag(self, X): + return self.variance*(np.square(X).sum(1) + 1.)**self.order + + def update_gradients_full(self, dL_dK, X, X2=None): + self.variance.gradient = np.sum(dL_dK * (self._dot_product(X, X2) + 1.)**self.order) + + def update_gradients_diag(self, dL_dKdiag, X): + raise NotImplementedError + + def gradients_X(self, dL_dK, X, X2=None): + raise NotImplementedError + + def gradients_X_diag(self, dL_dKdiag, X): + raise NotImplementedError From 64fb6ddc4cd6c726119c4b7f72ef9b0ef85c3421 Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Fri, 9 May 2014 09:04:56 +0100 Subject: [PATCH 09/43] [caching] done right --- GPy/util/caching.py | 149 ++++++++++++++++++++++++++------------------ 1 file changed, 87 insertions(+), 62 deletions(-) diff --git a/GPy/util/caching.py b/GPy/util/caching.py index bb162ee3..533d287a 100644 --- a/GPy/util/caching.py +++ b/GPy/util/caching.py @@ -1,84 +1,107 @@ from ..core.parameterization.parameter_core import Observable -import itertools +import itertools, collections, weakref class Cacher(object): - """ - - - """ - def __init__(self, operation, limit=5, ignore_args=(), force_kwargs=()): + """ + Parameters: + *********** + :param callable operation: function to cache + :param int limit: depth of cacher + :param [int] ignore_args: list of indices, pointing at arguments to ignore in *args of operation(*args). This includes self! + :param [str] force_kwargs: list of kwarg names (strings). If a kwarg with that name is given, the cacher will force recompute and wont cache anything. + """ self.limit = int(limit) self.ignore_args = ignore_args self.force_kwargs = force_kwargs self.operation=operation - self.cached_inputs = [] - self.cached_outputs = [] - self.inputs_changed = [] + self.order = collections.deque() + self.cached_inputs = {} # point from cache_ids to a list of [ind_ids], which where used in cache cache_id + + #======================================================================= + # point from each ind_id to [ref(obj), cache_ids] + # 0: a weak reference to the object itself + # 1: the cache_ids in which this ind_id is used (len will be how many times we have seen this ind_id) + self.cached_input_ids = {} + #======================================================================= + + self.cached_outputs = {} # point from cache_ids to outputs + self.inputs_changed = {} # point from cache_ids to bools + + def combine_args_kw(self, args, kw): + "Combines the args and kw in a unique way, such that ordering of kwargs does not lead to recompute" + return args + tuple(c[1] for c in sorted(kw.items(), key=lambda x: x[0])) + + def preprocess(self, combined_args_kw, ignore_args): + "get the cacheid (conc. string of argument ids in order) ignoring ignore_args" + return "".join(str(id(a)) for i,a in enumerate(combined_args_kw) if i not in ignore_args) + + def ensure_cache_length(self, cache_id): + "Ensures the cache is within its limits and has one place free" + if len(self.order) == self.limit: + # we have reached the limit, so lets release one element + cache_id = self.order.popleft() + combined_args_kw = self.cached_inputs[cache_id] + for ind_id in combined_args_kw: + ref, cache_ids = self.cached_input_ids[ind_id] + if len(cache_ids) == 1 and ref() is not None: + ref().remove_observer(self, self.on_cache_changed) + del self.cached_input_ids[ind_id] + else: + cache_ids.remove(cache_id) + self.cached_input_ids[ind_id] = [ref, cache_ids] + del self.cached_outputs[cache_id] + del self.inputs_changed[cache_id] + del self.cached_inputs[cache_id] + + def add_to_cache(self, cache_id, combined_args_kw, output): + self.inputs_changed[cache_id] = False + self.cached_outputs[cache_id] = output + self.order.append(cache_id) + self.cached_inputs[cache_id] = combined_args_kw + for a in combined_args_kw: + ind_id = id(a) + v = self.cached_input_ids.get(ind_id, [weakref.ref(a), []]) + v[1].append(cache_id) + if len(v[1]) == 1: + a.add_observer(self, self.on_cache_changed) + self.cached_input_ids[ind_id] = v def __call__(self, *args, **kw): """ A wrapper function for self.operation, """ - #ensure that specified arguments are ignored - items = sorted(kw.items(), key=lambda x: x[0]) - oa_all = args + tuple(a for _,a in items) - if len(self.ignore_args) != 0: - oa = [a for i,a in itertools.chain(enumerate(args), items) if i not in self.ignore_args and i not in self.force_kwargs] - else: - oa = oa_all - - # this makes sure we only add an observer once, and that None can be in args - observable_args = [] - for a in oa: - if (not any(a is ai for ai in observable_args)) and a is not None: - observable_args.append(a) - - #make sure that all the found argument really are observable: - #otherswise don't cache anything, pass args straight though - if not all([isinstance(arg, Observable) for arg in observable_args]): - return self.operation(*args, **kw) - + # 1: Check whether we have forced recompute arguments: if len(self.force_kwargs) != 0: - # check if there are force args, which force reloading for k in self.force_kwargs: if k in kw and kw[k] is not None: return self.operation(*args, **kw) - # TODO: WARNING !!! Cache OFFSWITCH !!! WARNING - # return self.operation(*args, **kw) - #if the result is cached, return the cached computation - state = [all(a is b for a, b in itertools.izip_longest(args, cached_i)) for cached_i in self.cached_inputs] + # 2: preprocess and get the unique id string for this call + combined_args_kw = self.combine_args_kw(args, kw) + cache_id = self.preprocess(combined_args_kw, self.ignore_args) + + # 2: if anything is not cachable, we will just return the operation, without caching + if reduce(lambda a,b: a or (not isinstance(b, Observable)), combined_args_kw, False): + return self.operation(*args, **kw) + # 3&4: check whether this cache_id has been cached, then has it changed? try: - if any(state): - i = state.index(True) - if self.inputs_changed[i]: - #(elements of) the args have changed since we last computed: update - self.cached_outputs[i] = self.operation(*args, **kw) - self.inputs_changed[i] = False - return self.cached_outputs[i] - else: - #first time we've seen these arguments: compute - - #first make sure the depth limit isn't exceeded - if len(self.cached_inputs) == self.limit: - args_ = self.cached_inputs.pop(0) - args_ = [a for i,a in enumerate(args_) if i not in self.ignore_args and i not in self.force_kwargs] - [a.remove_observer(self, self.on_cache_changed) for a in args_ if a is not None] - self.inputs_changed.pop(0) - self.cached_outputs.pop(0) - #compute - self.cached_inputs.append(oa_all) - self.cached_outputs.append(self.operation(*args, **kw)) - self.inputs_changed.append(False) - [a.add_observer(self, self.on_cache_changed) for a in observable_args] - return self.cached_outputs[-1]#return + if(self.inputs_changed[cache_id]): + # 4: This happens, when one element has changed for this cache id + self.inputs_changed[cache_id] = False + self.cached_outputs[cache_id] = self.operation(*args, **kw) + except KeyError: + # 3: This is when we never saw this chache_id: + self.ensure_cache_length(cache_id) + self.add_to_cache(cache_id, combined_args_kw, self.operation(*args, **kw)) except: self.reset() raise + # 5: We have seen this cache_id and it is cached: + return self.cached_outputs[cache_id] def on_cache_changed(self, direct, which=None): """ @@ -86,17 +109,19 @@ class Cacher(object): this function gets 'hooked up' to the inputs when we cache them, and upon their elements being changed we update here. """ - self.inputs_changed = [any([a is direct or a is which for a in args]) or old_ic for args, old_ic in zip(self.cached_inputs, self.inputs_changed)] + for ind_id in [id(direct), id(which)]: + _, cache_ids = self.cached_input_ids.get(ind_id, [None, []]) + for cache_id in cache_ids: + self.inputs_changed[cache_id] = True def reset(self): """ Totally reset the cache """ - [[a.remove_observer(self, self.on_cache_changed) for a in args if isinstance(a, Observable)] for args in self.cached_inputs] - [[a.remove_observer(self, self.reset) for a in args if isinstance(a, Observable)] for args in self.cached_inputs] - self.cached_inputs = [] - self.cached_outputs = [] - self.inputs_changed = [] + [a().remove_observer(self, self.on_cache_changed) if (a() is not None) else None for a in self.cached_input_ids.values()] + self.cached_input_ids = {} + self.cached_outputs = {} + self.inputs_changed = {} def __deepcopy__(self, memo=None): return Cacher(self.operation, self.limit, self.ignore_args, self.force_kwargs) From b050e1fb6439d0dea845ec960157de7fafe3c799 Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Fri, 9 May 2014 11:13:33 +0100 Subject: [PATCH 10/43] [caching] id fix --- GPy/util/caching.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/GPy/util/caching.py b/GPy/util/caching.py index 533d287a..c8cf7149 100644 --- a/GPy/util/caching.py +++ b/GPy/util/caching.py @@ -44,7 +44,8 @@ class Cacher(object): # we have reached the limit, so lets release one element cache_id = self.order.popleft() combined_args_kw = self.cached_inputs[cache_id] - for ind_id in combined_args_kw: + for ind in combined_args_kw: + ind_id = id(ind) ref, cache_ids = self.cached_input_ids[ind_id] if len(cache_ids) == 1 and ref() is not None: ref().remove_observer(self, self.on_cache_changed) From 289bb762cce334c1c2be3aaa0d519e9211f55dd1 Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Fri, 9 May 2014 11:14:30 +0100 Subject: [PATCH 11/43] [combination kernel] some fixing with error messages --- GPy/kern/_src/independent_outputs.py | 2 +- GPy/kern/_src/kern.py | 16 ++++++++-------- GPy/testing/kernel_tests.py | 11 ++++++++--- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/GPy/kern/_src/independent_outputs.py b/GPy/kern/_src/independent_outputs.py index 3493cf4f..12c51ca3 100644 --- a/GPy/kern/_src/independent_outputs.py +++ b/GPy/kern/_src/independent_outputs.py @@ -32,7 +32,7 @@ def index_to_slices(index): [ret[ind_i].append(slice(*indexes_i)) for ind_i,indexes_i in zip(ind[switchpoints[:-1]],zip(switchpoints,switchpoints[1:]))] return ret -class IndependentOutputs(Kern): +class IndependentOutputs(CombinationKernel): """ A kernel which can represent several independent functions. this kernel 'switches off' parts of the matrix where the output indexes are different. diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index 70bd42b9..4028e16b 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -45,7 +45,7 @@ class Kern(Parameterized): try: self.input_dim = int(input_dim) self.active_dims = active_dims# if active_dims is not None else slice(0, input_dim, 1) - except TypeError: + except ValueError: # input_dim is something else then an integer self.input_dim = input_dim if active_dims is not None: @@ -202,12 +202,12 @@ class Kern(Parameterized): return Prod([self, other], name) def _check_input_dim(self, X): - assert X.shape[1] == self.input_dim, "You did not specify active_dims and X has wrong shape: X_dim={}, whereas input_dim={}".format(X.shape[1], self.input_dim) - + assert X.shape[1] == self.input_dim, "{} did not specify active_dims and X has wrong shape: X_dim={}, whereas input_dim={}".format(self.name, X.shape[1], self.input_dim) + def _check_active_dims(self, X): assert X.shape[1] >= len(np.r_[self.active_dims]), "At least {} dimensional X needed, X.shape={!s}".format(len(np.r_[self.active_dims]), X.shape) - + class CombinationKernel(Kern): """ Abstract super class for combination kernels. @@ -238,16 +238,16 @@ class CombinationKernel(Kern): def get_input_dim_active_dims(self, kernels, extra_dims = None): #active_dims = reduce(np.union1d, (np.r_[x.active_dims] for x in kernels), np.array([], dtype=int)) #active_dims = np.array(np.concatenate((active_dims, extra_dims if extra_dims is not None else [])), dtype=int) - input_dim = np.array([k.input_dim for k in kernels]) - if np.all(input_dim[0]==input_dim): - input_dim = input_dim[0] + input_dim = " ".join(map(lambda k: "{!s}:{!s}".format(k.name, k.input_dim), kernels)) + if extra_dims is not None: + input_dim += " + extra:{!s}".format(extra_dims) active_dims = None return input_dim, active_dims def input_sensitivity(self): raise NotImplementedError("Choose the kernel you want to get the sensitivity for. You need to override the default behaviour for getting the input sensitivity to be able to get the input sensitivity. For sum kernel it is the sum of all sensitivities, TODO: product kernel? Other kernels?, also TODO: shall we return all the sensitivities here in the combination kernel? So we can combine them however we want? This could lead to just plot all the sensitivities here...") - def _check_input_dim(self, X): + def _check_active_dims(self, X): return def _check_input_dim(self, X): diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py index 5bd3f494..8c48d37f 100644 --- a/GPy/testing/kernel_tests.py +++ b/GPy/testing/kernel_tests.py @@ -319,7 +319,7 @@ class KernelTestsMiscellaneous(unittest.TestCase): GPy.kern.Kern(int(np.round((i+1)/j)), slice(0, i+1, j), "testkern") # test the ability to have only one dim sk = GPy.kern.RBF(2) + GPy.kern.Matern32(2) - self.assertEqual(sk.input_dim, 2) + self.assertEqual(sk.input_dim, "rbf:2 Mat32:2 + extra:[]") def test_which_parts(self): self.assertTrue(np.allclose(self.sumkern.K(self.X, which_parts=[self.linear, self.matern]), self.linear.K(self.X)+self.matern.K(self.X))) @@ -344,10 +344,15 @@ class KernelTestsNonContinuous(unittest.TestCase): self.X2[(N0*2):, -1] = 1 def test_IndependentOutputs(self): - k = GPy.kern.RBF(self.D) + k = GPy.kern.RBF(self.D, active_dims=range(self.D)) kern = GPy.kern.IndependentOutputs(k, -1, 'ind_single') self.assertTrue(check_kernel_gradient_functions(kern, X=self.X, X2=self.X2, verbose=verbose, fixed_X_dims=-1)) - k = [GPy.kern.RBF(1, active_dims=[1], name='rbf1'), GPy.kern.RBF(self.D, name='rbf012'), GPy.kern.RBF(2, active_dims=[0,2], name='rbf02')] + k = [GPy.kern.RBF(1, active_dims=[1], name='rbf1'), GPy.kern.RBF(self.D, active_dims=range(self.D), name='rbf012'), GPy.kern.RBF(2, active_dims=[0,2], name='rbf02')] + kern = GPy.kern.IndependentOutputs(k, -1, name='ind_split') + self.assertTrue(check_kernel_gradient_functions(kern, X=self.X, X2=self.X2, verbose=verbose, fixed_X_dims=-1)) + + def test_Hierarchical(self): + k = [GPy.kern.RBF(2, active_dims=[0,2], name='rbf1'), GPy.kern.RBF(2, active_dims=[0,2], name='rbf2')] kern = GPy.kern.IndependentOutputs(k, -1, name='ind_split') self.assertTrue(check_kernel_gradient_functions(kern, X=self.X, X2=self.X2, verbose=verbose, fixed_X_dims=-1)) From 675d4987948568d2ad176eee1794c61eefa085f8 Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Fri, 9 May 2014 14:05:22 +0100 Subject: [PATCH 12/43] [active_dims] all kernels now have int arrays as active_dims --- GPy/kern/_src/add.py | 5 +---- GPy/kern/_src/kern.py | 45 +++++++++++++++---------------------- GPy/testing/kernel_tests.py | 12 +--------- 3 files changed, 20 insertions(+), 42 deletions(-) diff --git a/GPy/kern/_src/add.py b/GPy/kern/_src/add.py index 88b8e40c..f5b54797 100644 --- a/GPy/kern/_src/add.py +++ b/GPy/kern/_src/add.py @@ -170,7 +170,4 @@ class Add(CombinationKernel): return self def input_sensitivity(self): - in_sen = np.zeros(self.input_dim) - for i, p in enumerate(self.parts): - in_sen[p.active_dims] += p.input_sensitivity() - return in_sen + return reduce(np.add, [k.input_sensitivity() for k in self.parts]) \ No newline at end of file diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index 4028e16b..368a9c87 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -34,36 +34,24 @@ class Kern(Parameterized): is the active_dimensions of inputs X we will work on. All kernels will get sliced Xes as inputs, if active_dims is not None + Only positive integers are allowed in active_dims! if active_dims is None, slicing is switched off and all X will be passed through as given. :param int input_dim: the number of input dimensions to the function - :param array-like|slice|None active_dims: list of indices on which dimensions this kernel works on, or none if no slicing + :param array-like|None active_dims: list of indices on which dimensions this kernel works on, or none if no slicing Do not instantiate. """ super(Kern, self).__init__(name=name, *a, **kw) - try: - self.input_dim = int(input_dim) - self.active_dims = active_dims# if active_dims is not None else slice(0, input_dim, 1) - except ValueError: - # input_dim is something else then an integer - self.input_dim = input_dim - if active_dims is not None: - print "WARNING: given input_dim={} is not an integer and active_dims={} is given, switching off slicing" - self.active_dims = None + self.input_dim = int(input_dim) + + if active_dims is None: + active_dims = np.arange(input_dim) + + self.active_dims = np.array(active_dims, dtype=int) + + assert self.active_dims.size == self.input_dim, "input_dim={} does not match len(active_dim)={}, active_dims={}".format(self.input_dim, self.active_dims.size, self.active_dims) - if self.active_dims is not None and self.input_dim is not None: - assert isinstance(self.active_dims, (slice, list, tuple, np.ndarray)), 'active_dims needs to be an array-like or slice object over dimensions, {} given'.format(self.active_dims.__class__) - if isinstance(self.active_dims, slice): - self.active_dims = slice(self.active_dims.start or 0, self.active_dims.stop or self.input_dim, self.active_dims.step or 1) - active_dim_size = int(np.round((self.active_dims.stop-self.active_dims.start)/self.active_dims.step)) - elif isinstance(self.active_dims, np.ndarray): - #assert np.all(self.active_dims >= 0), 'active dimensions need to be positive. negative indexing is not allowed' - assert self.active_dims.ndim == 1, 'only flat indices allowed, given active_dims.shape={}, provide only indexes to the dimensions (columns) of the input'.format(self.active_dims.shape) - active_dim_size = self.active_dims.size - else: - active_dim_size = len(self.active_dims) - assert active_dim_size == self.input_dim, "input_dim={} does not match len(active_dim)={}, active_dims={}".format(self.input_dim, active_dim_size, self.active_dims) self._sliced_X = 0 self.useGPU = self._support_GPU and useGPU @@ -205,7 +193,7 @@ class Kern(Parameterized): assert X.shape[1] == self.input_dim, "{} did not specify active_dims and X has wrong shape: X_dim={}, whereas input_dim={}".format(self.name, X.shape[1], self.input_dim) def _check_active_dims(self, X): - assert X.shape[1] >= len(np.r_[self.active_dims]), "At least {} dimensional X needed, X.shape={!s}".format(len(np.r_[self.active_dims]), X.shape) + assert X.shape[1] >= len(self.active_dims), "At least {} dimensional X needed, X.shape={!s}".format(len(self.active_dims), X.shape) class CombinationKernel(Kern): @@ -222,9 +210,10 @@ class CombinationKernel(Kern): :param list kernels: List of kernels to combine (can be only one element) :param str name: name of the combination kernel - :param array-like|slice extra_dims: if needed extra dimensions for the combination kernel to work on + :param array-like extra_dims: if needed extra dimensions for the combination kernel to work on """ assert all([isinstance(k, Kern) for k in kernels]) + extra_dims = np.array(extra_dims, dtype=int) input_dim, active_dims = self.get_input_dim_active_dims(kernels, extra_dims) # initialize the kernel with the full input_dim super(CombinationKernel, self).__init__(input_dim, active_dims, name) @@ -238,10 +227,12 @@ class CombinationKernel(Kern): def get_input_dim_active_dims(self, kernels, extra_dims = None): #active_dims = reduce(np.union1d, (np.r_[x.active_dims] for x in kernels), np.array([], dtype=int)) #active_dims = np.array(np.concatenate((active_dims, extra_dims if extra_dims is not None else [])), dtype=int) - input_dim = " ".join(map(lambda k: "{!s}:{!s}".format(k.name, k.input_dim), kernels)) + input_dim = reduce(max, (k.active_dims.max() for k in kernels)) + 1 + if extra_dims is not None: - input_dim += " + extra:{!s}".format(extra_dims) - active_dims = None + input_dim += extra_dims.size + + active_dims = np.arange(input_dim) return input_dim, active_dims def input_sensitivity(self): diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py index 8c48d37f..a942dc49 100644 --- a/GPy/testing/kernel_tests.py +++ b/GPy/testing/kernel_tests.py @@ -304,23 +304,13 @@ class KernelTestsMiscellaneous(unittest.TestCase): def setUp(self): N, D = 100, 10 self.X = np.linspace(-np.pi, +np.pi, N)[:,None] * np.random.uniform(-10,10,D) - self.rbf = GPy.kern.RBF(2, active_dims=slice(0,4,2)) + self.rbf = GPy.kern.RBF(2, active_dims=np.arange(0,4,2)) self.linear = GPy.kern.Linear(2, active_dims=(3,9)) self.matern = GPy.kern.Matern32(3, active_dims=np.array([1,7,9])) self.sumkern = self.rbf + self.linear self.sumkern += self.matern self.sumkern.randomize() - def test_active_dims(self): - # test the automatic dim detection expression for slices: - start, stop = 0, 277 - for i in range(start,stop,7): - for j in range(1,4): - GPy.kern.Kern(int(np.round((i+1)/j)), slice(0, i+1, j), "testkern") - # test the ability to have only one dim - sk = GPy.kern.RBF(2) + GPy.kern.Matern32(2) - self.assertEqual(sk.input_dim, "rbf:2 Mat32:2 + extra:[]") - def test_which_parts(self): self.assertTrue(np.allclose(self.sumkern.K(self.X, which_parts=[self.linear, self.matern]), self.linear.K(self.X)+self.matern.K(self.X))) self.assertTrue(np.allclose(self.sumkern.K(self.X, which_parts=[self.linear, self.rbf]), self.linear.K(self.X)+self.rbf.K(self.X))) From a717c03866613ae2f824fcdd638f9365ed3dffe4 Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Fri, 9 May 2014 14:05:57 +0100 Subject: [PATCH 13/43] [datasets] added singlecell dataset --- GPy/util/data_resources.json | 410 +-------------------- GPy/util/datasets.py | 14 +- GPy/util/datasets/data_resources_create.py | 8 + 3 files changed, 22 insertions(+), 410 deletions(-) diff --git a/GPy/util/data_resources.json b/GPy/util/data_resources.json index 57b79f10..a4a82edd 100644 --- a/GPy/util/data_resources.json +++ b/GPy/util/data_resources.json @@ -1,409 +1 @@ -{ - "rogers_girolami_data":{ - "files":[ - [ - "firstcoursemldata.tar.gz" - ] - ], - "license":null, - "citation":"A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146", - "details":"Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.", - "urls":[ - "https://www.dropbox.com/sh/7p6tu1t29idgliq/_XqlH_3nt9/" - ], - "suffices":[ - [ - "?dl=1" - ] - ], - "size":21949154 - }, - "ankur_pose_data":{ - "files":[ - [ - "ankurDataPoseSilhouette.mat" - ] - ], - "citation":"3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.", - "license":null, - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/ankur_pose_data/" - ], - "details":"Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing.", - "size":1 - }, - "football_data":{ - "files":[ - [ - "E0.csv", "E1.csv", "E2.csv", "E3.csv" - ] - ], - "citation":"", - "license":null, - "urls":[ - "http://www.football-data.co.uk/mmz4281/" - ], - "details":"Results of English football matches since 1993/94 season.", - "size":1 - }, - "google_trends":{ - "files":[ - [ - ] - ], - "citation":"", - "license":null, - "urls":[ - "http://www.google.com/trends/" - ], - "details":"Google trends results.", - "size":0 - }, - "osu_accad":{ - "files":[ - [ - "swagger1TXT.ZIP", - "handspring1TXT.ZIP", - "quickwalkTXT.ZIP", - "run1TXT.ZIP", - "sprintTXT.ZIP", - "dogwalkTXT.ZIP", - "camper_04TXT.ZIP", - "dance_KB3_TXT.ZIP", - "per20_TXT.ZIP", - "perTWO07_TXT.ZIP", - "perTWO13_TXT.ZIP", - "perTWO14_TXT.ZIP", - "perTWO15_TXT.ZIP", - "perTWO16_TXT.ZIP" - ], - [ - "connections.txt" - ] - ], - "license":"Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).", - "citation":"The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.", - "details":"Motion capture data of different motions from the Open Motion Data Project at Ohio State University.", - "urls":[ - "http://accad.osu.edu/research/mocap/data/", - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/" - ], - "size":15922790 - }, - "isomap_face_data":{ - "files":[ - [ - "face_data.mat" - ] - ], - "license":null, - "citation":"A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", - "details":"Face data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/isomap_face_data/" - ], - "size":24229368 - }, - "boston_housing":{ - "files":[ - [ - "Index", - "housing.data", - "housing.names" - ] - ], - "license":null, - "citation":"Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.", - "details":"The Boston Housing data relates house values in Boston to a range of input variables.", - "urls":[ - "http://archive.ics.uci.edu/ml/machine-learning-databases/housing/" - ], - "size":51276 - }, - "cmu_mocap_full":{ - "files":[ - [ - "allasfamc.zip" - ] - ], - "license":"From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.", - "citation":"Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.\nThe database was created with funding from NSF EIA-0196217.", - "details":"CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.", - "urls":[ - "http://mocap.cs.cmu.edu/subjects" - ], - "size":null - }, - "brendan_faces":{ - "files":[ - [ - "frey_rawface.mat" - ] - ], - "license":null, - "citation":"Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.", - "details":"A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.", - "urls":[ - "http://www.cs.nyu.edu/~roweis/data/" - ], - "size":1100584 - }, - "olympic_marathon_men":{ - "files":[ - [ - "olympicMarathonTimes.csv" - ] - ], - "license":null, - "citation":null, - "details":"Olympic mens' marathon gold medal winning times from 1896 to 2012. Time given in pace (minutes per kilometer). Data is originally downloaded and collated from Wikipedia, we are not responsible for errors in the data", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olympic_marathon_men/" - ], - "size":584 - }, - "pumadyn-32nm":{ - "files":[ - [ - "pumadyn-32nm.tar.gz" - ] - ], - "license":"Data is made available by the Delve system at the University of Toronto", - "citation":"Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.", - "details":"Pumadyn non linear 32 input data set with moderate noise. See http://www.cs.utoronto.ca/~delve/data/pumadyn/desc.html for details.", - "urls":[ - "ftp://ftp.cs.toronto.edu/pub/neuron/delve/data/tarfiles/pumadyn-family/" - ], - "size":5861646 - }, - "ripley_prnn_data":{ - "files":[ - [ - "Cushings.dat", - "README", - "crabs.dat", - "fglass.dat", - "fglass.grp", - "pima.te", - "pima.tr", - "pima.tr2", - "synth.te", - "synth.tr", - "viruses.dat", - "virus3.dat" - ] - ], - "license":null, - "citation":"Pattern Recognition and Neural Networks by B.D. Ripley (1996) Cambridge University Press ISBN 0 521 46986 7", - "details":"Data sets from Brian Ripley's Pattern Recognition and Neural Networks", - "urls":[ - "http://www.stats.ox.ac.uk/pub/PRNN/" - ], - "size":93565 - }, - "three_phase_oil_flow":{ - "files":[ - [ - "DataTrnLbls.txt", - "DataTrn.txt", - "DataTst.txt", - "DataTstLbls.txt", - "DataVdn.txt", - "DataVdnLbls.txt" - ] - ], - "license":null, - "citation":"Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. Nuclear Instruments and Methods in Physics Research A327, 580-593", - "details":"The three phase oil data used initially for demonstrating the Generative Topographic mapping.", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/three_phase_oil_flow/" - ], - "size":712796 - }, - "robot_wireless":{ - "files":[ - [ - "uw-floor.txt" - ] - ], - "license":null, - "citation":"WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.", - "details":"Data created by Brian Ferris and Dieter Fox. Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/robot_wireless/" - ], - "size":284390 - }, - "xw_pen":{ - "files":[ - [ - "xw_pen_15.csv" - ] - ], - "license":null, - "citation":"Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005", - "details":"Accelerometer pen data used for robust regression by Tipping and Lawrence.", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/xw_pen/" - ], - "size":3410 - }, - "swiss_roll":{ - "files":[ - [ - "swiss_roll_data.mat" - ] - ], - "license":null, - "citation":"A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", - "details":"Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", - "urls":[ - "http://isomap.stanford.edu/" - ], - "size":800256 - }, - "osu_run1":{ - "files":[ - [ - "run1TXT.ZIP" - ], - [ - "connections.txt" - ] - ], - "license":"Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).", - "citation":"The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.", - "details":"Motion capture data of a stick man running from the Open Motion Data Project at Ohio State University.", - "urls":[ - "http://accad.osu.edu/research/mocap/data/", - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/" - ], - "size":338103 - }, - "creep_rupture":{ - "files":[ - [ - "creeprupt.tar" - ] - ], - "license":null, - "citation":"Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.", - "details":"Provides 2066 creep rupture test results of steels (mainly of two kinds of steels: 2.25Cr and 9-12 wt% Cr ferritic steels). See http://www.msm.cam.ac.uk/map/data/materials/creeprupt-b.html.", - "urls":[ - "http://www.msm.cam.ac.uk/map/data/tar/" - ], - "size":602797 - }, - "olivetti_faces":{ - "files":[ - [ - "att_faces.zip" - ], - [ - "olivettifaces.mat" - ] - ], - "license":null, - "citation":"Ferdinando Samaria and Andy Harter, Parameterisation of a Stochastic Model for Human Face Identification. Proceedings of 2nd IEEE Workshop on Applications of Computer Vision, Sarasota FL, December 1994", - "details":"Olivetti Research Labs Face data base, acquired between December 1992 and December 1994 in the Olivetti Research Lab, Cambridge (which later became AT&T Laboratories, Cambridge). When using these images please give credit to AT&T Laboratories, Cambridge. ", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olivetti_faces/", - "http://www.cs.nyu.edu/~roweis/data/" - ], - "size":8561331 - }, - "olivetti_glasses":{ - "files":[ - [ - "has_glasses.np" - ], - [ - "olivettifaces.mat" - ] - ], - "license":null, - "citation":"Information recorded in olivetti_faces entry. Should be used from there.", - "details":"Information recorded in olivetti_faces entry. Should be used from there.", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olivetti_faces/", - "http://www.cs.nyu.edu/~roweis/data/" - ], - "size":4261047 - }, - "della_gatta":{ - "files":[ - [ - "DellaGattadata.mat" - ] - ], - "license":null, - "citation":"Direct targets of the TRP63 transcription factor revealed by a combination of gene expression profiling and reverse engineering. Giusy Della Gatta, Mukesh Bansal, Alberto Ambesi-Impiombato, Dario Antonini, Caterina Missero, and Diego di Bernardo, Genome Research 2008", - "details":"The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA.", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/della_gatta/" - ], - "size":3729650 - }, - "epomeo_gpx":{ - "files":[ - [ - "endomondo_1.gpx", - "endomondo_2.gpx", - "garmin_watch_via_endomondo.gpx", - "viewranger_phone.gpx", - "viewranger_tablet.gpx" - ] - ], - "license":null, - "citation":"", - "details":"Five different GPS traces of the same run up Mount Epomeo in Ischia. The traces are from different sources. endomondo_1 and endomondo_2 are traces from the mobile phone app Endomondo, with a split in the middle. garmin_watch_via_endomondo is the trace from a Garmin watch, with a segment missing about 4 kilometers in. viewranger_phone and viewranger_tablet are traces from a phone and a tablet through the viewranger app. The viewranger_phone data comes from the same mobile phone as the Endomondo data (i.e. there are 3 GPS devices, but one device recorded two traces).", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/epomeo_gpx/" - ], - "size":2031872 - }, - "mauna_loa":{ - "files":[ - [ - "co2_mm_mlo.txt" - ] - ], - "license":"-------------------------------------------------------------------- USE OF NOAA ESRL DATA\n\n These data are made freely available to the public and the scientific community in the belief that their wide dissemination will lead to greater understanding and new scientific insights. The availability of these data does not constitute publication of the data. NOAA relies on the ethics and integrity of the user to insure that ESRL receives fair credit for their work. If the data are obtained for potential use in a publication or presentation, ESRL should be informed at the outset of the nature of this work. If the ESRL data are essential to the work, or if an important result or conclusion depends on the ESRL data, co-authorship may be appropriate. This should be discussed at an early stage in the work. Manuscripts using the ESRL data should be sent to ESRL for review before they are submitted for publication so we can insure that the quality and limitations of the data are accurately represented.\n\n Contact: Pieter Tans (303 497 6678; pieter.tans@noaa.gov)\n\n RECIPROCITY Use of these data implies an agreement to reciprocate. Laboratories making similar measurements agree to make their own data available to the general public and to the scientific community in an equally complete and easily accessible form. Modelers are encouraged to make available to the community, upon request, their own tools used in the interpretation of the ESRL data, namely well documented model code, transport fields, and additional information necessary for other scientists to repeat the work and to run modified versions. Model availability includes collaborative support for new users of the models.\n --------------------------------------------------------------------\n\n See www.esrl.noaa.gov/gmd/ccgg/trends/ for additional details.", - "citation":"Mauna Loa Data. Dr. Pieter Tans, NOAA/ESRL (www.esrl.noaa.gov/gmd/ccgg/trends/) and Dr. Ralph Keeling, Scripps Institution of Oceanography (scrippsco2.ucsd.edu/).", - "details":"The 'average' column contains the monthly mean CO2 mole fraction determined from daily averages. The mole fraction of CO2, expressed as parts per million (ppm) is the number of molecules of CO2 in every one million molecules of dried air (water vapor removed). If there are missing days concentrated either early or late in the month, the monthly mean is corrected to the middle of the month using the average seasonal cycle. Missing months are denoted by -99.99. The 'interpolated' column includes average values from the preceding column and interpolated values where data are missing. Interpolated values are computed in two steps. First, we compute for each month the average seasonal cycle in a 7-year window around each monthly value. In this way the seasonal cycle is allowed to change slowly over time. We then determine the 'trend' value for each month by removing the seasonal cycle; this result is shown in the 'trend' column. Trend values are linearly interpolated for missing months. The interpolated monthly mean is then the sum of the average seasonal cycle value and the trend value for the missing month.\n\nNOTE: In general, the data presented for the last year are subject to change, depending on recalibration of the reference gas mixtures used, and other quality control procedures. Occasionally, earlier years may also be changed for the same reasons. Usually these changes are minor.\n\nCO2 expressed as a mole fraction in dry air, micromol/mol, abbreviated as ppm \n\n (-99.99 missing data; -1 no data for daily means in month)", - "urls":[ - "ftp://aftp.cmdl.noaa.gov/products/trends/co2/" - ], - "size":46779 - }, - "boxjenkins_airline":{ - "files":[ - [ - "boxjenkins_airline.csv" - ] - ], - "license":"You may copy and redistribute the data. You may make derivative works from the data. You may use the data for commercial purposes. You may not sublicence the data when redistributing it. You may not redistribute the data under a different license. Source attribution on any use of this data: Must refer source.", - "citation":"Box & Jenkins (1976), in file: data/airpass, Description: International airline passengers: monthly totals in thousands. Jan 49 – Dec 60", - "details":"International airline passengers, monthly totals from January 1949 to December 1960.", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/boxjenkins_airline/" - ], - "size":46779 - }, - - "decampos_characters":{ - "files":[ - [ - "characters.npy", - "digits.npy" - ] - ], - "license":null, - "citation":"T. de Campos, B. R. Babu, and M. Varma. Character recognition in natural images. VISAPP 2009.", - "details":"Examples of hand written digits taken from the de Campos et al paper on Character Recognition in Natural Images.", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/decampos_digits/" - ], - "size":2031872 - } -} +{"rogers_girolami_data": {"files": [["firstcoursemldata.tar.gz"]], "license": null, "citation": "A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146", "details": "Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.", "urls": ["https://www.dropbox.com/sh/7p6tu1t29idgliq/_XqlH_3nt9/"], "suffices": [["?dl=1"]], "size": 21949154}, "ankur_pose_data": {"files": [["ankurDataPoseSilhouette.mat"]], "citation": "3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.", "license": null, "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/ankur_pose_data/"], "details": "Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing."}, "osu_accad": {"files": [["swagger1TXT.ZIP", "handspring1TXT.ZIP", "quickwalkTXT.ZIP", "run1TXT.ZIP", "sprintTXT.ZIP", "dogwalkTXT.ZIP", "camper_04TXT.ZIP", "dance_KB3_TXT.ZIP", "per20_TXT.ZIP", "perTWO07_TXT.ZIP", "perTWO13_TXT.ZIP", "perTWO14_TXT.ZIP", "perTWO15_TXT.ZIP", "perTWO16_TXT.ZIP"], ["connections.txt"]], "license": "Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).", "citation": "The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.", "details": "Motion capture data of different motions from the Open Motion Data Project at Ohio State University.", "urls": ["http://accad.osu.edu/research/mocap/data/", "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/"], "size": 15922790}, "isomap_face_data": {"files": [["face_data.mat"]], "license": null, "citation": "A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", "details": "Face data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/isomap_face_data/"], "size": 24229368}, "boston_housing": {"files": [["Index", "housing.data", "housing.names"]], "license": null, "citation": "Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.", "details": "The Boston Housing data relates house values in Boston to a range of input variables.", "urls": ["http://archive.ics.uci.edu/ml/machine-learning-databases/housing/"], "size": 51276}, "cmu_mocap_full": {"files": [["allasfamc.zip"]], "license": "From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.", "citation": "Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.'\n 'The database was created with funding from NSF EIA-0196217.", "details": "CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.", "urls": ["http://mocap.cs.cmu.edu"], "size": null}, "brendan_faces": {"files": [["frey_rawface.mat"]], "license": null, "citation": "Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.", "details": "A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.", "urls": ["http://www.cs.nyu.edu/~roweis/data/"], "size": 1100584}, "singlecell": {"files": [["singlecell.csv"]], "license": "ScienceDirect: http://www.elsevier.com/locate/termsandconditions?utm_source=sciencedirect&utm_medium=link&utm_campaign=terms", "citation": "Guoji Guo, Mikael Huss, Guo Qing Tong, Chaoyang Wang, Li Li Sun, Neil D. Clarke, Paul Robson, Resolution of Cell Fate Decisions Revealed by Single-Cell Gene Expression Analysis from Zygote to Blastocyst, Developmental Cell, Volume 18, Issue 4, 20 April 2010, Pages 675-685, ISSN 1534-5807, http://dx.doi.org/10.1016/j.devcel.2010.02.012. (http://www.sciencedirect.com/science/article/pii/S1534580710001103) Keywords: DEVBIO", "details": "qPCR Singlecell experiment in Mouse, measuring 48 gene expressions in 1-64 cell states. The labels have been created as in Guo et al. [2010]", "urls": ["http://staffwww.dcs.sheffield.ac.uk/people/M.Zwiessele/data/singlecell/"], "size": 233.1}, "olympic_marathon_men": {"files": [["olympicMarathonTimes.csv"]], "license": null, "citation": null, "details": "Olympic mens' marathon gold medal winning times from 1896 to 2012. Time given in pace (minutes per kilometer). Data is originally downloaded and collated from Wikipedia, we are not responsible for errors in the data", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olympic_marathon_men/"], "size": 584}, "pumadyn-32nm": {"files": [["pumadyn-32nm.tar.gz"]], "license": "Data is made available by the Delve system at the University of Toronto", "citation": "Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.", "details": "Pumadyn non linear 32 input data set with moderate noise. See http://www.cs.utoronto.ca/~delve/data/pumadyn/desc.html for details.", "urls": ["ftp://ftp.cs.toronto.edu/pub/neuron/delve/data/tarfiles/pumadyn-family/"], "size": 5861646}, "ripley_prnn_data": {"files": [["Cushings.dat", "README", "crabs.dat", "fglass.dat", "fglass.grp", "pima.te", "pima.tr", "pima.tr2", "synth.te", "synth.tr", "viruses.dat", "virus3.dat"]], "license": null, "citation": "Pattern Recognition and Neural Networks by B.D. Ripley (1996) Cambridge University Press ISBN 0 521 46986 7", "details": "Data sets from Brian Ripley's Pattern Recognition and Neural Networks", "urls": ["http://www.stats.ox.ac.uk/pub/PRNN/"], "size": 93565}, "three_phase_oil_flow": {"files": [["DataTrnLbls.txt", "DataTrn.txt", "DataTst.txt", "DataTstLbls.txt", "DataVdn.txt", "DataVdnLbls.txt"]], "license": null, "citation": "Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. Nuclear Instruments and Methods in Physics Research A327, 580-593", "details": "The three phase oil data used initially for demonstrating the Generative Topographic mapping.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/three_phase_oil_flow/"], "size": 712796}, "robot_wireless": {"files": [["uw-floor.txt"]], "license": null, "citation": "WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.", "details": "Data created by Brian Ferris and Dieter Fox. Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/robot_wireless/"], "size": 284390}, "xw_pen": {"files": [["xw_pen_15.csv"]], "license": null, "citation": "Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005", "details": "Accelerometer pen data used for robust regression by Tipping and Lawrence.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/xw_pen/"], "size": 3410}, "swiss_roll": {"files": [["swiss_roll_data.mat"]], "license": null, "citation": "A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", "details": "Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", "urls": ["http://isomap.stanford.edu/"], "size": 800256}, "osu_run1": {"files": [["run1TXT.ZIP"], ["connections.txt"]], "license": "Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).", "citation": "The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.", "details": "Motion capture data of a stick man running from the Open Motion Data Project at Ohio State University.", "urls": ["http://accad.osu.edu/research/mocap/data/", "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/"], "size": 338103}, "creep_rupture": {"files": [["creeprupt.tar"]], "license": null, "citation": "Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.", "details": "Provides 2066 creep rupture test results of steels (mainly of two kinds of steels: 2.25Cr and 9-12 wt% Cr ferritic steels). See http://www.msm.cam.ac.uk/map/data/materials/creeprupt-b.html.", "urls": ["http://www.msm.cam.ac.uk/map/data/tar/"], "size": 602797}, "hapmap3": {"files": [["hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2", "hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2", "relationships_w_pops_121708.txt"]], "license": "International HapMap Project Public Access License (http://hapmap.ncbi.nlm.nih.gov/cgi-perl/registration#licence)", "citation": "Gibbs, Richard A., et al. \"The international HapMap project.\" Nature 426.6968 (2003): 789-796.", "details": "\n HapMap Project: Single Nucleotide Polymorphism sequenced in all human populations. \n The HapMap phase three SNP dataset - 1184 samples out of 11 populations.\n See http://www.nature.com/nature/journal/v426/n6968/abs/nature02168.html for details.\n\n SNP_matrix (A) encoding [see Paschou et all. 2007 (PCA-Correlated SNPs...)]:\n Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then\n\n / 1, iff SNPij==(B1,B1)\n Aij = | 0, iff SNPij==(B1,B2)\n \\ -1, iff SNPij==(B2,B2)\n\n The SNP data and the meta information (such as iid, sex and phenotype) are\n stored in the dataframe datadf, index is the Individual ID, \n with following columns for metainfo:\n\n * family_id -> Family ID\n * paternal_id -> Paternal ID\n * maternal_id -> Maternal ID\n * sex -> Sex (1=male; 2=female; other=unknown)\n * phenotype -> Phenotype (-9, or 0 for unknown)\n * population -> Population string (e.g. 'ASW' - 'YRI')\n * rest are SNP rs (ids)\n\n More information is given in infodf:\n\n * Chromosome:\n - autosomal chromosemes -> 1-22\n - X X chromosome -> 23\n - Y Y chromosome -> 24\n - XY Pseudo-autosomal region of X -> 25\n - MT Mitochondrial -> 26\n * Relative Positon (to Chromosome) [base pairs]\n\n ", "urls": ["http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/latest_phaseIII_ncbi_b36/plink_format/"], "size": 3458246739}, "olivetti_faces": {"files": [["att_faces.zip"], ["olivettifaces.mat"]], "license": null, "citation": "Ferdinando Samaria and Andy Harter, Parameterisation of a Stochastic Model for Human Face Identification. Proceedings of 2nd IEEE Workshop on Applications of Computer Vision, Sarasota FL, December 1994", "details": "Olivetti Research Labs Face data base, acquired between December 1992 and December 1994 in the Olivetti Research Lab, Cambridge (which later became AT&T Laboratories, Cambridge). When using these images please give credit to AT&T Laboratories, Cambridge. ", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olivetti_faces/", "http://www.cs.nyu.edu/~roweis/data/"], "size": 8561331}, "della_gatta": {"files": [["DellaGattadata.mat"]], "license": null, "citation": "Direct targets of the TRP63 transcription factor revealed by a combination of gene expression profiling and reverse engineering. Giusy Della Gatta, Mukesh Bansal, Alberto Ambesi-Impiombato, Dario Antonini, Caterina Missero, and Diego di Bernardo, Genome Research 2008", "details": "The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/della_gatta/"], "size": 3729650}, "epomeo_gpx": {"files": [["endomondo_1.gpx", "endomondo_2.gpx", "garmin_watch_via_endomondo.gpx", "viewranger_phone.gpx", "viewranger_tablet.gpx"]], "license": null, "citation": "", "details": "Five different GPS traces of the same run up Mount Epomeo in Ischia. The traces are from different sources. endomondo_1 and endomondo_2 are traces from the mobile phone app Endomondo, with a split in the middle. garmin_watch_via_endomondo is the trace from a Garmin watch, with a segment missing about 4 kilometers in. viewranger_phone and viewranger_tablet are traces from a phone and a tablet through the viewranger app. The viewranger_phone data comes from the same mobile phone as the Endomondo data (i.e. there are 3 GPS devices, but one device recorded two traces).", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/epomeo_gpx/"], "size": 2031872}} \ No newline at end of file diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py index 3f42055b..02c5cdb9 100644 --- a/GPy/util/datasets.py +++ b/GPy/util/datasets.py @@ -717,7 +717,19 @@ def hapmap3(data_set='hapmap3'): inandf=inandf, populations=populations) return hapmap - + +def singlecell(data_set='singlecell'): + if not data_available(data_set): + download_data(data_set) + dirpath = os.path.join(data_path, data_set) + data = np.loadtxt(os.path.join(dirpath, 'singlecell.csv'), delimiter=",", dtype=str) + genes = data[0, 1:] + labels = data[1:, 0] + Y = np.array(data[1:, 1:], dtype=float) + return data_details_return({'Y': Y, 'info' : "qPCR Singlecell experiment in Mouse, measuring 48 gene expressions in 1-64 cell states. The labels have been created as in Guo et al. [2010]", + 'genes':genes, 'labels':labels, + }, data_set) + def swiss_roll_1000(): return swiss_roll(num_samples=1000) diff --git a/GPy/util/datasets/data_resources_create.py b/GPy/util/datasets/data_resources_create.py index 4e7c3524..919e3ea4 100644 --- a/GPy/util/datasets/data_resources_create.py +++ b/GPy/util/datasets/data_resources_create.py @@ -161,6 +161,14 @@ data_resources = {'ankur_pose_data' : {'urls' : [neil_url + 'ankur_pose_data/'], 'citation': """Gibbs, Richard A., et al. "The international HapMap project." Nature 426.6968 (2003): 789-796.""", 'license' : """International HapMap Project Public Access License (http://hapmap.ncbi.nlm.nih.gov/cgi-perl/registration#licence)""", 'size' : 2*1729092237 + 62265}, + + 'singlecell' : {'urls' : ["http://staffwww.dcs.sheffield.ac.uk/people/M.Zwiessele/data/singlecell/"], + 'files' : [['singlecell.csv']], + 'details' : "qPCR Singlecell experiment in Mouse, measuring 48 gene expressions in 1-64 cell states. The labels have been created as in Guo et al. [2010]", + 'citation' : "Guoji Guo, Mikael Huss, Guo Qing Tong, Chaoyang Wang, Li Li Sun, Neil D. Clarke, Paul Robson, Resolution of Cell Fate Decisions Revealed by Single-Cell Gene Expression Analysis from Zygote to Blastocyst, Developmental Cell, Volume 18, Issue 4, 20 April 2010, Pages 675-685, ISSN 1534-5807, http://dx.doi.org/10.1016/j.devcel.2010.02.012. (http://www.sciencedirect.com/science/article/pii/S1534580710001103) Keywords: DEVBIO", + 'license' : "ScienceDirect: http://www.elsevier.com/locate/termsandconditions?utm_source=sciencedirect&utm_medium=link&utm_campaign=terms", + 'size' : 233.1, + } } with open('data_resources.json', 'w') as f: From 457e00f05894b6bafe9e995fda59590481161239 Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Fri, 9 May 2014 14:06:41 +0100 Subject: [PATCH 14/43] [caching] when reset --- GPy/util/caching.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPy/util/caching.py b/GPy/util/caching.py index c8cf7149..583cb26f 100644 --- a/GPy/util/caching.py +++ b/GPy/util/caching.py @@ -119,7 +119,7 @@ class Cacher(object): """ Totally reset the cache """ - [a().remove_observer(self, self.on_cache_changed) if (a() is not None) else None for a in self.cached_input_ids.values()] + [a().remove_observer(self, self.on_cache_changed) if (a() is not None) else None for [a, _] in self.cached_input_ids.values()] self.cached_input_ids = {} self.cached_outputs = {} self.inputs_changed = {} From 4eac8a59655738b05cbc716a0834d3d94dfddd8c Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Fri, 9 May 2014 14:07:34 +0100 Subject: [PATCH 15/43] [param concatenation] allows assignmend more robustly --- GPy/core/parameterization/param.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index c22ad3c5..7055838a 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -316,8 +316,8 @@ class ParamConcatenation(object): val = val.values() ind = numpy.zeros(sum(self._param_sizes), dtype=bool); ind[s] = True; vals = self.values(); vals[s] = val - [numpy.copyto(p, vals[ps], where=ind[ps]) - for p, ps in zip(self.params, self._param_slices_)] + for p, ps in zip(self.params, self._param_slices_): + p.flat[ind[ps]] = vals[ps] if update: self.update_all_params() def values(self): From 8f54449a4ec6d92e6c7317d22cf1d66b3a224c1b Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Fri, 9 May 2014 14:07:42 +0100 Subject: [PATCH 16/43] whitespaces --- GPy/plotting/matplot_dep/dim_reduction_plots.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPy/plotting/matplot_dep/dim_reduction_plots.py b/GPy/plotting/matplot_dep/dim_reduction_plots.py index 57d932cc..ca2c890f 100644 --- a/GPy/plotting/matplot_dep/dim_reduction_plots.py +++ b/GPy/plotting/matplot_dep/dim_reduction_plots.py @@ -121,7 +121,7 @@ def plot_latent(model, labels=None, which_indices=None, if plot_inducing: Z = param_to_array(model.Z) ax.plot(Z[:, input_1], Z[:, input_2], '^w') - + ax.set_xlim((xmin, xmax)) ax.set_ylim((ymin, ymax)) @@ -132,7 +132,7 @@ def plot_latent(model, labels=None, which_indices=None, except Exception as e: print "Could not invoke tight layout: {}".format(e) pass - + if updates: try: ax.figure.canvas.show() From 1c1c6008a96a209612e33e9b05cc3b1db97fd7ff Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Mon, 12 May 2014 09:12:24 +0100 Subject: [PATCH 17/43] [data] data_resources edited, such that json file is edited directly --- GPy/util/data_resources.json | 441 ++++++++++++++++++++++++++++++++++- 1 file changed, 440 insertions(+), 1 deletion(-) diff --git a/GPy/util/data_resources.json b/GPy/util/data_resources.json index a4a82edd..8f5f06bc 100644 --- a/GPy/util/data_resources.json +++ b/GPy/util/data_resources.json @@ -1 +1,440 @@ -{"rogers_girolami_data": {"files": [["firstcoursemldata.tar.gz"]], "license": null, "citation": "A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146", "details": "Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.", "urls": ["https://www.dropbox.com/sh/7p6tu1t29idgliq/_XqlH_3nt9/"], "suffices": [["?dl=1"]], "size": 21949154}, "ankur_pose_data": {"files": [["ankurDataPoseSilhouette.mat"]], "citation": "3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.", "license": null, "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/ankur_pose_data/"], "details": "Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing."}, "osu_accad": {"files": [["swagger1TXT.ZIP", "handspring1TXT.ZIP", "quickwalkTXT.ZIP", "run1TXT.ZIP", "sprintTXT.ZIP", "dogwalkTXT.ZIP", "camper_04TXT.ZIP", "dance_KB3_TXT.ZIP", "per20_TXT.ZIP", "perTWO07_TXT.ZIP", "perTWO13_TXT.ZIP", "perTWO14_TXT.ZIP", "perTWO15_TXT.ZIP", "perTWO16_TXT.ZIP"], ["connections.txt"]], "license": "Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).", "citation": "The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.", "details": "Motion capture data of different motions from the Open Motion Data Project at Ohio State University.", "urls": ["http://accad.osu.edu/research/mocap/data/", "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/"], "size": 15922790}, "isomap_face_data": {"files": [["face_data.mat"]], "license": null, "citation": "A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", "details": "Face data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/isomap_face_data/"], "size": 24229368}, "boston_housing": {"files": [["Index", "housing.data", "housing.names"]], "license": null, "citation": "Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.", "details": "The Boston Housing data relates house values in Boston to a range of input variables.", "urls": ["http://archive.ics.uci.edu/ml/machine-learning-databases/housing/"], "size": 51276}, "cmu_mocap_full": {"files": [["allasfamc.zip"]], "license": "From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.", "citation": "Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.'\n 'The database was created with funding from NSF EIA-0196217.", "details": "CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.", "urls": ["http://mocap.cs.cmu.edu"], "size": null}, "brendan_faces": {"files": [["frey_rawface.mat"]], "license": null, "citation": "Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.", "details": "A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.", "urls": ["http://www.cs.nyu.edu/~roweis/data/"], "size": 1100584}, "singlecell": {"files": [["singlecell.csv"]], "license": "ScienceDirect: http://www.elsevier.com/locate/termsandconditions?utm_source=sciencedirect&utm_medium=link&utm_campaign=terms", "citation": "Guoji Guo, Mikael Huss, Guo Qing Tong, Chaoyang Wang, Li Li Sun, Neil D. Clarke, Paul Robson, Resolution of Cell Fate Decisions Revealed by Single-Cell Gene Expression Analysis from Zygote to Blastocyst, Developmental Cell, Volume 18, Issue 4, 20 April 2010, Pages 675-685, ISSN 1534-5807, http://dx.doi.org/10.1016/j.devcel.2010.02.012. (http://www.sciencedirect.com/science/article/pii/S1534580710001103) Keywords: DEVBIO", "details": "qPCR Singlecell experiment in Mouse, measuring 48 gene expressions in 1-64 cell states. The labels have been created as in Guo et al. [2010]", "urls": ["http://staffwww.dcs.sheffield.ac.uk/people/M.Zwiessele/data/singlecell/"], "size": 233.1}, "olympic_marathon_men": {"files": [["olympicMarathonTimes.csv"]], "license": null, "citation": null, "details": "Olympic mens' marathon gold medal winning times from 1896 to 2012. Time given in pace (minutes per kilometer). Data is originally downloaded and collated from Wikipedia, we are not responsible for errors in the data", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olympic_marathon_men/"], "size": 584}, "pumadyn-32nm": {"files": [["pumadyn-32nm.tar.gz"]], "license": "Data is made available by the Delve system at the University of Toronto", "citation": "Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.", "details": "Pumadyn non linear 32 input data set with moderate noise. See http://www.cs.utoronto.ca/~delve/data/pumadyn/desc.html for details.", "urls": ["ftp://ftp.cs.toronto.edu/pub/neuron/delve/data/tarfiles/pumadyn-family/"], "size": 5861646}, "ripley_prnn_data": {"files": [["Cushings.dat", "README", "crabs.dat", "fglass.dat", "fglass.grp", "pima.te", "pima.tr", "pima.tr2", "synth.te", "synth.tr", "viruses.dat", "virus3.dat"]], "license": null, "citation": "Pattern Recognition and Neural Networks by B.D. Ripley (1996) Cambridge University Press ISBN 0 521 46986 7", "details": "Data sets from Brian Ripley's Pattern Recognition and Neural Networks", "urls": ["http://www.stats.ox.ac.uk/pub/PRNN/"], "size": 93565}, "three_phase_oil_flow": {"files": [["DataTrnLbls.txt", "DataTrn.txt", "DataTst.txt", "DataTstLbls.txt", "DataVdn.txt", "DataVdnLbls.txt"]], "license": null, "citation": "Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. Nuclear Instruments and Methods in Physics Research A327, 580-593", "details": "The three phase oil data used initially for demonstrating the Generative Topographic mapping.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/three_phase_oil_flow/"], "size": 712796}, "robot_wireless": {"files": [["uw-floor.txt"]], "license": null, "citation": "WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.", "details": "Data created by Brian Ferris and Dieter Fox. Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/robot_wireless/"], "size": 284390}, "xw_pen": {"files": [["xw_pen_15.csv"]], "license": null, "citation": "Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005", "details": "Accelerometer pen data used for robust regression by Tipping and Lawrence.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/xw_pen/"], "size": 3410}, "swiss_roll": {"files": [["swiss_roll_data.mat"]], "license": null, "citation": "A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", "details": "Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", "urls": ["http://isomap.stanford.edu/"], "size": 800256}, "osu_run1": {"files": [["run1TXT.ZIP"], ["connections.txt"]], "license": "Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).", "citation": "The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.", "details": "Motion capture data of a stick man running from the Open Motion Data Project at Ohio State University.", "urls": ["http://accad.osu.edu/research/mocap/data/", "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/"], "size": 338103}, "creep_rupture": {"files": [["creeprupt.tar"]], "license": null, "citation": "Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.", "details": "Provides 2066 creep rupture test results of steels (mainly of two kinds of steels: 2.25Cr and 9-12 wt% Cr ferritic steels). See http://www.msm.cam.ac.uk/map/data/materials/creeprupt-b.html.", "urls": ["http://www.msm.cam.ac.uk/map/data/tar/"], "size": 602797}, "hapmap3": {"files": [["hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2", "hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2", "relationships_w_pops_121708.txt"]], "license": "International HapMap Project Public Access License (http://hapmap.ncbi.nlm.nih.gov/cgi-perl/registration#licence)", "citation": "Gibbs, Richard A., et al. \"The international HapMap project.\" Nature 426.6968 (2003): 789-796.", "details": "\n HapMap Project: Single Nucleotide Polymorphism sequenced in all human populations. \n The HapMap phase three SNP dataset - 1184 samples out of 11 populations.\n See http://www.nature.com/nature/journal/v426/n6968/abs/nature02168.html for details.\n\n SNP_matrix (A) encoding [see Paschou et all. 2007 (PCA-Correlated SNPs...)]:\n Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then\n\n / 1, iff SNPij==(B1,B1)\n Aij = | 0, iff SNPij==(B1,B2)\n \\ -1, iff SNPij==(B2,B2)\n\n The SNP data and the meta information (such as iid, sex and phenotype) are\n stored in the dataframe datadf, index is the Individual ID, \n with following columns for metainfo:\n\n * family_id -> Family ID\n * paternal_id -> Paternal ID\n * maternal_id -> Maternal ID\n * sex -> Sex (1=male; 2=female; other=unknown)\n * phenotype -> Phenotype (-9, or 0 for unknown)\n * population -> Population string (e.g. 'ASW' - 'YRI')\n * rest are SNP rs (ids)\n\n More information is given in infodf:\n\n * Chromosome:\n - autosomal chromosemes -> 1-22\n - X X chromosome -> 23\n - Y Y chromosome -> 24\n - XY Pseudo-autosomal region of X -> 25\n - MT Mitochondrial -> 26\n * Relative Positon (to Chromosome) [base pairs]\n\n ", "urls": ["http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/latest_phaseIII_ncbi_b36/plink_format/"], "size": 3458246739}, "olivetti_faces": {"files": [["att_faces.zip"], ["olivettifaces.mat"]], "license": null, "citation": "Ferdinando Samaria and Andy Harter, Parameterisation of a Stochastic Model for Human Face Identification. Proceedings of 2nd IEEE Workshop on Applications of Computer Vision, Sarasota FL, December 1994", "details": "Olivetti Research Labs Face data base, acquired between December 1992 and December 1994 in the Olivetti Research Lab, Cambridge (which later became AT&T Laboratories, Cambridge). When using these images please give credit to AT&T Laboratories, Cambridge. ", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olivetti_faces/", "http://www.cs.nyu.edu/~roweis/data/"], "size": 8561331}, "della_gatta": {"files": [["DellaGattadata.mat"]], "license": null, "citation": "Direct targets of the TRP63 transcription factor revealed by a combination of gene expression profiling and reverse engineering. Giusy Della Gatta, Mukesh Bansal, Alberto Ambesi-Impiombato, Dario Antonini, Caterina Missero, and Diego di Bernardo, Genome Research 2008", "details": "The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/della_gatta/"], "size": 3729650}, "epomeo_gpx": {"files": [["endomondo_1.gpx", "endomondo_2.gpx", "garmin_watch_via_endomondo.gpx", "viewranger_phone.gpx", "viewranger_tablet.gpx"]], "license": null, "citation": "", "details": "Five different GPS traces of the same run up Mount Epomeo in Ischia. The traces are from different sources. endomondo_1 and endomondo_2 are traces from the mobile phone app Endomondo, with a split in the middle. garmin_watch_via_endomondo is the trace from a Garmin watch, with a segment missing about 4 kilometers in. viewranger_phone and viewranger_tablet are traces from a phone and a tablet through the viewranger app. The viewranger_phone data comes from the same mobile phone as the Endomondo data (i.e. there are 3 GPS devices, but one device recorded two traces).", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/epomeo_gpx/"], "size": 2031872}} \ No newline at end of file +{ + "olivetti_glasses": { + "files": [ + [ + "has_glasses.np" + ], + [ + "olivettifaces.mat" + ] + ], + "license": null, + "citation": "Information recorded in olivetti_faces entry. Should be used from there.", + "details": "Information recorded in olivetti_faces entry. Should be used from there.", + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olivetti_faces/", + "http://www.cs.nyu.edu/~roweis/data/" + ], + "size": 4261047 + }, + "boston_housing": { + "files": [ + [ + "Index", + "housing.data", + "housing.names" + ] + ], + "license": null, + "citation": "Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.", + "details": "The Boston Housing data relates house values in Boston to a range of input variables.", + "urls": [ + "http://archive.ics.uci.edu/ml/machine-learning-databases/housing/" + ], + "size": 51276 + }, + "google_trends": { + "files": [ + [] + ], + "license": null, + "citation": "", + "details": "Google trends results.", + "urls": [ + "http://www.google.com/trends/" + ], + "size": 0 + }, + "mauna_loa": { + "files": [ + [ + "co2_mm_mlo.txt" + ] + ], + "license": "-------------------------------------------------------------------- USE OF NOAA ESRL DATA\n\n These data are made freely available to the public and the scientific community in the belief that their wide dissemination will lead to greater understanding and new scientific insights. The availability of these data does not constitute publication of the data. NOAA relies on the ethics and integrity of the user to insure that ESRL receives fair credit for their work. If the data are obtained for potential use in a publication or presentation, ESRL should be informed at the outset of the nature of this work. If the ESRL data are essential to the work, or if an important result or conclusion depends on the ESRL data, co-authorship may be appropriate. This should be discussed at an early stage in the work. Manuscripts using the ESRL data should be sent to ESRL for review before they are submitted for publication so we can insure that the quality and limitations of the data are accurately represented.\n\n Contact: Pieter Tans (303 497 6678; pieter.tans@noaa.gov)\n\n RECIPROCITY Use of these data implies an agreement to reciprocate. Laboratories making similar measurements agree to make their own data available to the general public and to the scientific community in an equally complete and easily accessible form. Modelers are encouraged to make available to the community, upon request, their own tools used in the interpretation of the ESRL data, namely well documented model code, transport fields, and additional information necessary for other scientists to repeat the work and to run modified versions. Model availability includes collaborative support for new users of the models.\n --------------------------------------------------------------------\n\n See www.esrl.noaa.gov/gmd/ccgg/trends/ for additional details.", + "citation": "Mauna Loa Data. Dr. Pieter Tans, NOAA/ESRL (www.esrl.noaa.gov/gmd/ccgg/trends/) and Dr. Ralph Keeling, Scripps Institution of Oceanography (scrippsco2.ucsd.edu/).", + "details": "The 'average' column contains the monthly mean CO2 mole fraction determined from daily averages. The mole fraction of CO2, expressed as parts per million (ppm) is the number of molecules of CO2 in every one million molecules of dried air (water vapor removed). If there are missing days concentrated either early or late in the month, the monthly mean is corrected to the middle of the month using the average seasonal cycle. Missing months are denoted by -99.99. The 'interpolated' column includes average values from the preceding column and interpolated values where data are missing. Interpolated values are computed in two steps. First, we compute for each month the average seasonal cycle in a 7-year window around each monthly value. In this way the seasonal cycle is allowed to change slowly over time. We then determine the 'trend' value for each month by removing the seasonal cycle; this result is shown in the 'trend' column. Trend values are linearly interpolated for missing months. The interpolated monthly mean is then the sum of the average seasonal cycle value and the trend value for the missing month.\n\nNOTE: In general, the data presented for the last year are subject to change, depending on recalibration of the reference gas mixtures used, and other quality control procedures. Occasionally, earlier years may also be changed for the same reasons. Usually these changes are minor.\n\nCO2 expressed as a mole fraction in dry air, micromol/mol, abbreviated as ppm \n\n (-99.99 missing data; -1 no data for daily means in month)", + "urls": [ + "ftp://aftp.cmdl.noaa.gov/products/trends/co2/" + ], + "size": 46779 + }, + "osu_run1": { + "files": [ + [ + "run1TXT.ZIP" + ], + [ + "connections.txt" + ] + ], + "license": "Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).", + "citation": "The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.", + "details": "Motion capture data of a stick man running from the Open Motion Data Project at Ohio State University.", + "urls": [ + "http://accad.osu.edu/research/mocap/data/", + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/" + ], + "size": 338103 + }, + "swiss_roll": { + "files": [ + [ + "swiss_roll_data.mat" + ] + ], + "license": null, + "citation": "A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", + "details": "Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", + "urls": [ + "http://isomap.stanford.edu/" + ], + "size": 800256 + }, + "ripley_prnn_data": { + "files": [ + [ + "Cushings.dat", + "README", + "crabs.dat", + "fglass.dat", + "fglass.grp", + "pima.te", + "pima.tr", + "pima.tr2", + "synth.te", + "synth.tr", + "viruses.dat", + "virus3.dat" + ] + ], + "license": null, + "citation": "Pattern Recognition and Neural Networks by B.D. Ripley (1996) Cambridge University Press ISBN 0 521 46986 7", + "details": "Data sets from Brian Ripley's Pattern Recognition and Neural Networks", + "urls": [ + "http://www.stats.ox.ac.uk/pub/PRNN/" + ], + "size": 93565 + }, + "rogers_girolami_data": { + "files": [ + [ + "firstcoursemldata.tar.gz" + ] + ], + "license": null, + "citation": "A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146", + "details": "Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.", + "urls": [ + "https://www.dropbox.com/sh/7p6tu1t29idgliq/_XqlH_3nt9/" + ], + "suffices": [ + [ + "?dl=1" + ] + ], + "size": 21949154 + }, + "singlecell": { + "files": [ + [ + "singlecell.csv" + ] + ], + "license": "ScienceDirect: http://www.elsevier.com/locate/termsandconditions?utm_source=sciencedirect&utm_medium=link&utm_campaign=terms", + "citation": "Guoji Guo, Mikael Huss, Guo Qing Tong, Chaoyang Wang, Li Li Sun, Neil D. Clarke, Paul Robson, Resolution of Cell Fate Decisions Revealed by Single-Cell Gene Expression Analysis from Zygote to Blastocyst, Developmental Cell, Volume 18, Issue 4, 20 April 2010, Pages 675-685, ISSN 1534-5807, http://dx.doi.org/10.1016/j.devcel.2010.02.012. (http://www.sciencedirect.com/science/article/pii/S1534580710001103) Keywords: DEVBIO", + "details": "qPCR Singlecell experiment in Mouse, measuring 48 gene expressions in 1-64 cell states. The labels have been created as in Guo et al. [2010]", + "urls": [ + "http://staffwww.dcs.sheffield.ac.uk/people/M.Zwiessele/data/singlecell/" + ], + "size": 233.1 + }, + "della_gatta": { + "files": [ + [ + "DellaGattadata.mat" + ] + ], + "license": null, + "citation": "Direct targets of the TRP63 transcription factor revealed by a combination of gene expression profiling and reverse engineering. Giusy Della Gatta, Mukesh Bansal, Alberto Ambesi-Impiombato, Dario Antonini, Caterina Missero, and Diego di Bernardo, Genome Research 2008", + "details": "The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA.", + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/della_gatta/" + ], + "size": 3729650 + }, + "creep_rupture": { + "files": [ + [ + "creeprupt.tar" + ] + ], + "license": null, + "citation": "Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.", + "details": "Provides 2066 creep rupture test results of steels (mainly of two kinds of steels: 2.25Cr and 9-12 wt% Cr ferritic steels). See http://www.msm.cam.ac.uk/map/data/materials/creeprupt-b.html.", + "urls": [ + "http://www.msm.cam.ac.uk/map/data/tar/" + ], + "size": 602797 + }, + "olivetti_faces": { + "files": [ + [ + "att_faces.zip" + ], + [ + "olivettifaces.mat" + ] + ], + "license": null, + "citation": "Ferdinando Samaria and Andy Harter, Parameterisation of a Stochastic Model for Human Face Identification. Proceedings of 2nd IEEE Workshop on Applications of Computer Vision, Sarasota FL, December 1994", + "details": "Olivetti Research Labs Face data base, acquired between December 1992 and December 1994 in the Olivetti Research Lab, Cambridge (which later became AT&T Laboratories, Cambridge). When using these images please give credit to AT&T Laboratories, Cambridge. ", + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olivetti_faces/", + "http://www.cs.nyu.edu/~roweis/data/" + ], + "size": 8561331 + }, + "robot_wireless": { + "files": [ + [ + "uw-floor.txt" + ] + ], + "license": null, + "citation": "WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.", + "details": "Data created by Brian Ferris and Dieter Fox. Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.", + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/robot_wireless/" + ], + "size": 284390 + }, + "cmu_mocap_full": { + "files": [ + [ + "allasfamc.zip" + ] + ], + "license": "From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.", + "citation": "Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.\nThe database was created with funding from NSF EIA-0196217.", + "details": "CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.", + "urls": [ + "http://mocap.cs.cmu.edu/subjects" + ], + "size": null + }, + "football_data": { + "files": [ + [ + "E0.csv", + "E1.csv", + "E2.csv", + "E3.csv" + ] + ], + "license": null, + "citation": "", + "details": "Results of English football matches since 1993/94 season.", + "urls": [ + "http://www.football-data.co.uk/mmz4281/" + ], + "size": 1 + }, + "decampos_characters": { + "files": [ + [ + "characters.npy", + "digits.npy" + ] + ], + "license": null, + "citation": "T. de Campos, B. R. Babu, and M. Varma. Character recognition in natural images. VISAPP 2009.", + "details": "Examples of hand written digits taken from the de Campos et al paper on Character Recognition in Natural Images.", + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/decampos_digits/" + ], + "size": 2031872 + }, + "three_phase_oil_flow": { + "files": [ + [ + "DataTrnLbls.txt", + "DataTrn.txt", + "DataTst.txt", + "DataTstLbls.txt", + "DataVdn.txt", + "DataVdnLbls.txt" + ] + ], + "license": null, + "citation": "Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. Nuclear Instruments and Methods in Physics Research A327, 580-593", + "details": "The three phase oil data used initially for demonstrating the Generative Topographic mapping.", + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/three_phase_oil_flow/" + ], + "size": 712796 + }, + "pumadyn-32nm": { + "files": [ + [ + "pumadyn-32nm.tar.gz" + ] + ], + "license": "Data is made available by the Delve system at the University of Toronto", + "citation": "Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.", + "details": "Pumadyn non linear 32 input data set with moderate noise. See http://www.cs.utoronto.ca/~delve/data/pumadyn/desc.html for details.", + "urls": [ + "ftp://ftp.cs.toronto.edu/pub/neuron/delve/data/tarfiles/pumadyn-family/" + ], + "size": 5861646 + }, + "epomeo_gpx": { + "files": [ + [ + "endomondo_1.gpx", + "endomondo_2.gpx", + "garmin_watch_via_endomondo.gpx", + "viewranger_phone.gpx", + "viewranger_tablet.gpx" + ] + ], + "license": null, + "citation": "", + "details": "Five different GPS traces of the same run up Mount Epomeo in Ischia. The traces are from different sources. endomondo_1 and endomondo_2 are traces from the mobile phone app Endomondo, with a split in the middle. garmin_watch_via_endomondo is the trace from a Garmin watch, with a segment missing about 4 kilometers in. viewranger_phone and viewranger_tablet are traces from a phone and a tablet through the viewranger app. The viewranger_phone data comes from the same mobile phone as the Endomondo data (i.e. there are 3 GPS devices, but one device recorded two traces).", + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/epomeo_gpx/" + ], + "size": 2031872 + }, + "ankur_pose_data": { + "files": [ + [ + "ankurDataPoseSilhouette.mat" + ] + ], + "license": null, + "citation": "3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.", + "details": "Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing.", + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/ankur_pose_data/" + ], + "size": 1 + }, + "isomap_face_data": { + "files": [ + [ + "face_data.mat" + ] + ], + "license": null, + "citation": "A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", + "details": "Face data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/isomap_face_data/" + ], + "size": 24229368 + }, + "brendan_faces": { + "files": [ + [ + "frey_rawface.mat" + ] + ], + "license": null, + "citation": "Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.", + "details": "A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.", + "urls": [ + "http://www.cs.nyu.edu/~roweis/data/" + ], + "size": 1100584 + }, + "olympic_marathon_men": { + "files": [ + [ + "olympicMarathonTimes.csv" + ] + ], + "license": null, + "citation": null, + "details": "Olympic mens' marathon gold medal winning times from 1896 to 2012. Time given in pace (minutes per kilometer). Data is originally downloaded and collated from Wikipedia, we are not responsible for errors in the data", + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olympic_marathon_men/" + ], + "size": 584 + }, + "hapmap3": { + "files": [ + [ + "hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2", + "hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2", + "relationships_w_pops_121708.txt" + ] + ], + "license": "International HapMap Project Public Access License (http://hapmap.ncbi.nlm.nih.gov/cgi-perl/registration#licence)", + "citation": "Gibbs, Richard A., et al. \"The international HapMap project.\" Nature 426.6968 (2003): 789-796.", + "details": "\n HapMap Project: Single Nucleotide Polymorphism sequenced in all human populations. \n The HapMap phase three SNP dataset - 1184 samples out of 11 populations.\n See http://www.nature.com/nature/journal/v426/n6968/abs/nature02168.html for details.\n\n SNP_matrix (A) encoding [see Paschou et all. 2007 (PCA-Correlated SNPs...)]:\n Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then\n\n / 1, iff SNPij==(B1,B1)\n Aij = | 0, iff SNPij==(B1,B2)\n \\ -1, iff SNPij==(B2,B2)\n\n The SNP data and the meta information (such as iid, sex and phenotype) are\n stored in the dataframe datadf, index is the Individual ID, \n with following columns for metainfo:\n\n * family_id -> Family ID\n * paternal_id -> Paternal ID\n * maternal_id -> Maternal ID\n * sex -> Sex (1=male; 2=female; other=unknown)\n * phenotype -> Phenotype (-9, or 0 for unknown)\n * population -> Population string (e.g. 'ASW' - 'YRI')\n * rest are SNP rs (ids)\n\n More information is given in infodf:\n\n * Chromosome:\n - autosomal chromosemes -> 1-22\n - X X chromosome -> 23\n - Y Y chromosome -> 24\n - XY Pseudo-autosomal region of X -> 25\n - MT Mitochondrial -> 26\n * Relative Positon (to Chromosome) [base pairs]\n\n ", + "urls": [ + "http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/latest_phaseIII_ncbi_b36/plink_format/" + ], + "size": 3458246739 + }, + "boxjenkins_airline": { + "files": [ + [ + "boxjenkins_airline.csv" + ] + ], + "license": "You may copy and redistribute the data. You may make derivative works from the data. You may use the data for commercial purposes. You may not sublicence the data when redistributing it. You may not redistribute the data under a different license. Source attribution on any use of this data: Must refer source.", + "citation": "Box & Jenkins (1976), in file: data/airpass, Description: International airline passengers: monthly totals in thousands. Jan 49 – Dec 60", + "details": "International airline passengers, monthly totals from January 1949 to December 1960.", + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/boxjenkins_airline/" + ], + "size": 46779 + }, + "osu_accad": { + "files": [ + [ + "swagger1TXT.ZIP", + "handspring1TXT.ZIP", + "quickwalkTXT.ZIP", + "run1TXT.ZIP", + "sprintTXT.ZIP", + "dogwalkTXT.ZIP", + "camper_04TXT.ZIP", + "dance_KB3_TXT.ZIP", + "per20_TXT.ZIP", + "perTWO07_TXT.ZIP", + "perTWO13_TXT.ZIP", + "perTWO14_TXT.ZIP", + "perTWO15_TXT.ZIP", + "perTWO16_TXT.ZIP" + ], + [ + "connections.txt" + ] + ], + "license": "Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).", + "citation": "The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.", + "details": "Motion capture data of different motions from the Open Motion Data Project at Ohio State University.", + "urls": [ + "http://accad.osu.edu/research/mocap/data/", + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/" + ], + "size": 15922790 + }, + "xw_pen": { + "files": [ + [ + "xw_pen_15.csv" + ] + ], + "license": null, + "citation": "Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005", + "details": "Accelerometer pen data used for robust regression by Tipping and Lawrence.", + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/xw_pen/" + ], + "size": 3410 + } +} From a163bf985e285fdfc0960175e968317689d5062e Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Mon, 12 May 2014 09:13:30 +0100 Subject: [PATCH 18/43] [data] edit json file directly, removed datasets.py and data_resources_create --- GPy/util/datasets.py | 1127 -------------------- GPy/util/datasets/data_resources_create.py | 176 --- 2 files changed, 1303 deletions(-) delete mode 100644 GPy/util/datasets.py delete mode 100644 GPy/util/datasets/data_resources_create.py diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py deleted file mode 100644 index 02c5cdb9..00000000 --- a/GPy/util/datasets.py +++ /dev/null @@ -1,1127 +0,0 @@ -import csv -import os -import copy -import numpy as np -import pylab as pb -import GPy -import scipy.io -import cPickle as pickle -import zipfile -import tarfile -import datetime -import json -import re - -ipython_available=True -try: - import IPython -except ImportError: - ipython_available=False - - -import sys, urllib2 - -def reporthook(a,b,c): - # ',' at the end of the line is important! - #print "% 3.1f%% of %d bytes\r" % (min(100, float(a * b) / c * 100), c), - #you can also use sys.stdout.write - sys.stdout.write("\r% 3.1f%% of %d bytes" % (min(100, float(a * b) / c * 100), c)) - sys.stdout.flush() - -# Global variables -data_path = os.path.join(os.path.dirname(__file__), 'datasets') -default_seed = 10000 -overide_manual_authorize=False -neil_url = 'http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/' - -# Read data resources from json file. -# Don't do this when ReadTheDocs is scanning as it breaks things -on_rtd = os.environ.get('READTHEDOCS', None) == 'True' #Checks if RTD is scanning - -if not (on_rtd): - path = os.path.join(os.path.dirname(__file__), 'data_resources.json') - json_data=open(path).read() - data_resources = json.loads(json_data) - -if not (on_rtd): - path = os.path.join(os.path.dirname(__file__), 'football_teams.json') - json_data=open(path).read() - football_dict = json.loads(json_data) - - - -def prompt_user(prompt): - """Ask user for agreeing to data set licenses.""" - # raw_input returns the empty string for "enter" - yes = set(['yes', 'y']) - no = set(['no','n']) - - try: - print(prompt) - choice = raw_input().lower() - # would like to test for exception here, but not sure if we can do that without importing IPython - except: - print('Stdin is not implemented.') - print('You need to set') - print('overide_manual_authorize=True') - print('to proceed with the download. Please set that variable and continue.') - raise - - - if choice in yes: - return True - elif choice in no: - return False - else: - print("Your response was a " + choice) - print("Please respond with 'yes', 'y' or 'no', 'n'") - #return prompt_user() - - -def data_available(dataset_name=None): - """Check if the data set is available on the local machine already.""" - for file_list in data_resources[dataset_name]['files']: - for file in file_list: - if not os.path.exists(os.path.join(data_path, dataset_name, file)): - return False - return True - -def download_url(url, store_directory, save_name = None, messages = True, suffix=''): - """Download a file from a url and save it to disk.""" - i = url.rfind('/') - file = url[i+1:] - print file - dir_name = os.path.join(data_path, store_directory) - save_name = os.path.join(dir_name, file) - print "Downloading ", url, "->", os.path.join(store_directory, file) - if not os.path.exists(dir_name): - os.makedirs(dir_name) - try: - response = urllib2.urlopen(url+suffix) - except urllib2.URLError, e: - if not hasattr(e, "code"): - raise - response = e - if response.code > 399 and response.code<500: - raise ValueError('Tried url ' + url + suffix + ' and received client error ' + str(response.code)) - elif response.code > 499: - raise ValueError('Tried url ' + url + suffix + ' and received server error ' + str(response.code)) - with open(save_name, 'wb') as f: - meta = response.info() - file_size = int(meta.getheaders("Content-Length")[0]) - status = "" - file_size_dl = 0 - block_sz = 8192 - line_length=30 - while True: - buff = response.read(block_sz) - if not buff: - break - file_size_dl += len(buff) - f.write(buff) - sys.stdout.write(" "*(len(status)) + "\r") - status = r"[{perc: <{ll}}] {dl:7.3f}/{full:.3f}MB".format(dl=file_size_dl/(1.*1e6), - full=file_size/(1.*1e6), ll=line_length, - perc="="*int(line_length*float(file_size_dl)/file_size)) - sys.stdout.write(status) - sys.stdout.flush() - sys.stdout.write(" "*(len(status)) + "\r") - print status - # if we wanted to get more sophisticated maybe we should check the response code here again even for successes. - #with open(save_name, 'wb') as f: - # f.write(response.read()) - - #urllib.urlretrieve(url+suffix, save_name, reporthook) - -def authorize_download(dataset_name=None): - """Check with the user that the are happy with terms and conditions for the data set.""" - print('Acquiring resource: ' + dataset_name) - # TODO, check resource is in dictionary! - print('') - dr = data_resources[dataset_name] - print('Details of data: ') - print(dr['details']) - print('') - if dr['citation']: - print('Please cite:') - print(dr['citation']) - print('') - if dr['size']: - print('After downloading the data will take up ' + str(dr['size']) + ' bytes of space.') - print('') - print('Data will be stored in ' + os.path.join(data_path, dataset_name) + '.') - print('') - if overide_manual_authorize: - if dr['license']: - print('You have agreed to the following license:') - print(dr['license']) - print('') - return True - else: - if dr['license']: - print('You must also agree to the following license:') - print(dr['license']) - print('') - return prompt_user('Do you wish to proceed with the download? [yes/no]') - -def download_data(dataset_name=None): - """Check with the user that the are happy with terms and conditions for the data set, then download it.""" - - dr = data_resources[dataset_name] - if not authorize_download(dataset_name): - raise Exception("Permission to download data set denied.") - - if dr.has_key('suffices'): - for url, files, suffices in zip(dr['urls'], dr['files'], dr['suffices']): - for file, suffix in zip(files, suffices): - download_url(os.path.join(url,file), dataset_name, dataset_name, suffix=suffix) - else: - for url, files in zip(dr['urls'], dr['files']): - for file in files: - download_url(os.path.join(url,file), dataset_name, dataset_name) - return True - -def data_details_return(data, data_set): - """Update the data component of the data dictionary with details drawn from the data_resources.""" - data.update(data_resources[data_set]) - return data - - -def cmu_urls_files(subj_motions, messages = True): - ''' - Find which resources are missing on the local disk for the requested CMU motion capture motions. - ''' - dr = data_resources['cmu_mocap_full'] - cmu_url = dr['urls'][0] - - subjects_num = subj_motions[0] - motions_num = subj_motions[1] - - resource = {'urls' : [], 'files' : []} - # Convert numbers to strings - subjects = [] - motions = [list() for _ in range(len(subjects_num))] - for i in range(len(subjects_num)): - curSubj = str(int(subjects_num[i])) - if int(subjects_num[i]) < 10: - curSubj = '0' + curSubj - subjects.append(curSubj) - for j in range(len(motions_num[i])): - curMot = str(int(motions_num[i][j])) - if int(motions_num[i][j]) < 10: - curMot = '0' + curMot - motions[i].append(curMot) - - all_skels = [] - - assert len(subjects) == len(motions) - - all_motions = [] - - for i in range(len(subjects)): - skel_dir = os.path.join(data_path, 'cmu_mocap') - cur_skel_file = os.path.join(skel_dir, subjects[i] + '.asf') - - url_required = False - file_download = [] - if not os.path.exists(cur_skel_file): - # Current skel file doesn't exist. - if not os.path.isdir(skel_dir): - os.mkdir(skel_dir) - # Add skel file to list. - url_required = True - file_download.append(subjects[i] + '.asf') - for j in range(len(motions[i])): - file_name = subjects[i] + '_' + motions[i][j] + '.amc' - cur_motion_file = os.path.join(skel_dir, file_name) - if not os.path.exists(cur_motion_file): - url_required = True - file_download.append(subjects[i] + '_' + motions[i][j] + '.amc') - if url_required: - resource['urls'].append(cmu_url + '/' + subjects[i] + '/') - resource['files'].append(file_download) - return resource - -try: - import gpxpy - import gpxpy.gpx - gpxpy_available = True - -except ImportError: - gpxpy_available = False - -if gpxpy_available: - def epomeo_gpx(data_set='epomeo_gpx', sample_every=4): - if not data_available(data_set): - download_data(data_set) - files = ['endomondo_1', 'endomondo_2', 'garmin_watch_via_endomondo','viewranger_phone', 'viewranger_tablet'] - - X = [] - for file in files: - gpx_file = open(os.path.join(data_path, 'epomeo_gpx', file + '.gpx'), 'r') - - gpx = gpxpy.parse(gpx_file) - segment = gpx.tracks[0].segments[0] - points = [point for track in gpx.tracks for segment in track.segments for point in segment.points] - data = [[(point.time-datetime.datetime(2013,8,21)).total_seconds(), point.latitude, point.longitude, point.elevation] for point in points] - X.append(np.asarray(data)[::sample_every, :]) - gpx_file.close() - return data_details_return({'X' : X, 'info' : 'Data is an array containing time in seconds, latitude, longitude and elevation in that order.'}, data_set) - -#del gpxpy_available - - - -# Some general utilities. -def sample_class(f): - p = 1. / (1. + np.exp(-f)) - c = np.random.binomial(1, p) - c = np.where(c, 1, -1) - return c - -def boston_housing(data_set='boston_housing'): - if not data_available(data_set): - download_data(data_set) - all_data = np.genfromtxt(os.path.join(data_path, data_set, 'housing.data')) - X = all_data[:, 0:13] - Y = all_data[:, 13:14] - return data_details_return({'X' : X, 'Y': Y}, data_set) - -def brendan_faces(data_set='brendan_faces'): - if not data_available(data_set): - download_data(data_set) - mat_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'frey_rawface.mat')) - Y = mat_data['ff'].T - return data_details_return({'Y': Y}, data_set) - -def della_gatta_TRP63_gene_expression(data_set='della_gatta', gene_number=None): - if not data_available(data_set): - download_data(data_set) - mat_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'DellaGattadata.mat')) - X = np.double(mat_data['timepoints']) - if gene_number == None: - Y = mat_data['exprs_tp53_RMA'] - else: - Y = mat_data['exprs_tp53_RMA'][:, gene_number] - if len(Y.shape) == 1: - Y = Y[:, None] - return data_details_return({'X': X, 'Y': Y, 'gene_number' : gene_number}, data_set) - - - -def football_data(season='1314', data_set='football_data'): - """Football data from English games since 1993. This downloads data from football-data.co.uk for the given season. """ - def league2num(string): - league_dict = {'E0':0, 'E1':1, 'E2': 2, 'E3': 3, 'EC':4} - return league_dict[string] - - def football2num(string): - if football_dict.has_key(string): - return football_dict[string] - else: - football_dict[string] = len(football_dict)+1 - return len(football_dict)+1 - - data_set_season = data_set + '_' + season - data_resources[data_set_season] = copy.deepcopy(data_resources[data_set]) - data_resources[data_set_season]['urls'][0]+=season + '/' - start_year = int(season[0:2]) - end_year = int(season[2:4]) - files = ['E0.csv', 'E1.csv', 'E2.csv', 'E3.csv'] - if start_year>4 and start_year < 93: - files += ['EC.csv'] - data_resources[data_set_season]['files'] = [files] - if not data_available(data_set_season): - download_data(data_set_season) - for file in reversed(files): - filename = os.path.join(data_path, data_set_season, file) - # rewrite files removing blank rows. - writename = os.path.join(data_path, data_set_season, 'temp.csv') - input = open(filename, 'rb') - output = open(writename, 'wb') - writer = csv.writer(output) - for row in csv.reader(input): - if any(field.strip() for field in row): - writer.writerow(row) - input.close() - output.close() - table = np.loadtxt(writename,skiprows=1, usecols=(0, 1, 2, 3, 4, 5), converters = {0: league2num, 1: pb.datestr2num, 2:football2num, 3:football2num}, delimiter=',') - X = table[:, :4] - Y = table[:, 4:] - return data_details_return({'X': X, 'Y': Y}, data_set) - -# This will be for downloading google trends data. -def google_trends(query_terms=['big data', 'machine learning', 'data science'], data_set='google_trends'): - """Data downloaded from Google trends for given query terms. Warning, if you use this function multiple times in a row you get blocked due to terms of service violations.""" - # Inspired by this notebook: - # http://nbviewer.ipython.org/github/sahuguet/notebooks/blob/master/GoogleTrends%20meet%20Notebook.ipynb - - # quote the query terms. - for i, element in enumerate(query_terms): - query_terms[i] = urllib2.quote(element) - query = 'http://www.google.com/trends/fetchComponent?q=%s&cid=TIMESERIES_GRAPH_0&export=3' % ",".join(query_terms) - - data = urllib2.urlopen(query).read() - - # In the notebook they did some data cleaning: remove Javascript header+footer, and translate new Date(....,..,..) into YYYY-MM-DD. - header = """// Data table response\ngoogle.visualization.Query.setResponse(""" - data = data[len(header):-2] - data = re.sub('new Date\((\d+),(\d+),(\d+)\)', (lambda m: '"%s-%02d-%02d"' % (m.group(1).strip(), 1+int(m.group(2)), int(m.group(3)))), data) - timeseries = json.loads(data) - #import pandas as pd - columns = [k['label'] for k in timeseries['table']['cols']] - rows = map(lambda x: [k['v'] for k in x['c']], timeseries['table']['rows']) - terms = len(columns)-1 - X = np.asarray([(pb.datestr2num(row[0]), i) for i in range(terms) for row in rows ]) - Y = np.asarray([[row[i+1]] for i in range(terms) for row in rows ]) - output_info = columns[1:] - return data_details_return({'X': X, 'Y': Y, 'query_terms': output_info, 'info': "Data downloaded from google trends with query terms: " + ', '.join(output_info) + '.'}, data_set) - -# The data sets -def oil(data_set='three_phase_oil_flow'): - """The three phase oil data from Bishop and James (1993).""" - if not data_available(data_set): - download_data(data_set) - oil_train_file = os.path.join(data_path, data_set, 'DataTrn.txt') - oil_trainlbls_file = os.path.join(data_path, data_set, 'DataTrnLbls.txt') - oil_test_file = os.path.join(data_path, data_set, 'DataTst.txt') - oil_testlbls_file = os.path.join(data_path, data_set, 'DataTstLbls.txt') - oil_valid_file = os.path.join(data_path, data_set, 'DataVdn.txt') - oil_validlbls_file = os.path.join(data_path, data_set, 'DataVdnLbls.txt') - fid = open(oil_train_file) - X = np.fromfile(fid, sep='\t').reshape((-1, 12)) - fid.close() - fid = open(oil_test_file) - Xtest = np.fromfile(fid, sep='\t').reshape((-1, 12)) - fid.close() - fid = open(oil_valid_file) - Xvalid = np.fromfile(fid, sep='\t').reshape((-1, 12)) - fid.close() - fid = open(oil_trainlbls_file) - Y = np.fromfile(fid, sep='\t').reshape((-1, 3)) * 2. - 1. - fid.close() - fid = open(oil_testlbls_file) - Ytest = np.fromfile(fid, sep='\t').reshape((-1, 3)) * 2. - 1. - fid.close() - fid = open(oil_validlbls_file) - Yvalid = np.fromfile(fid, sep='\t').reshape((-1, 3)) * 2. - 1. - fid.close() - return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'Xtest' : Xtest, 'Xvalid': Xvalid, 'Yvalid': Yvalid}, data_set) - #else: - # throw an error - -def oil_100(seed=default_seed, data_set = 'three_phase_oil_flow'): - np.random.seed(seed=seed) - data = oil() - indices = np.random.permutation(1000) - indices = indices[0:100] - X = data['X'][indices, :] - Y = data['Y'][indices, :] - return data_details_return({'X': X, 'Y': Y, 'info': "Subsample of the full oil data extracting 100 values randomly without replacement, here seed was " + str(seed)}, data_set) - -def pumadyn(seed=default_seed, data_set='pumadyn-32nm'): - if not data_available(data_set): - download_data(data_set) - path = os.path.join(data_path, data_set) - tar = tarfile.open(os.path.join(path, 'pumadyn-32nm.tar.gz')) - print('Extracting file.') - tar.extractall(path=path) - tar.close() - # Data is variance 1, no need to normalize. - data = np.loadtxt(os.path.join(data_path, data_set, 'pumadyn-32nm', 'Dataset.data.gz')) - indices = np.random.permutation(data.shape[0]) - indicesTrain = indices[0:7168] - indicesTest = indices[7168:-1] - indicesTrain.sort(axis=0) - indicesTest.sort(axis=0) - X = data[indicesTrain, 0:-2] - Y = data[indicesTrain, -1][:, None] - Xtest = data[indicesTest, 0:-2] - Ytest = data[indicesTest, -1][:, None] - return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'seed': seed}, data_set) - -def robot_wireless(data_set='robot_wireless'): - # WiFi access point strengths on a tour around UW Paul Allen building. - if not data_available(data_set): - download_data(data_set) - file_name = os.path.join(data_path, data_set, 'uw-floor.txt') - all_time = np.genfromtxt(file_name, usecols=(0)) - macaddress = np.genfromtxt(file_name, usecols=(1), dtype='string') - x = np.genfromtxt(file_name, usecols=(2)) - y = np.genfromtxt(file_name, usecols=(3)) - strength = np.genfromtxt(file_name, usecols=(4)) - addresses = np.unique(macaddress) - times = np.unique(all_time) - addresses.sort() - times.sort() - allY = np.zeros((len(times), len(addresses))) - allX = np.zeros((len(times), 2)) - allY[:]=-92. - strengths={} - for address, j in zip(addresses, range(len(addresses))): - ind = np.nonzero(address==macaddress) - temp_strengths=strength[ind] - temp_x=x[ind] - temp_y=y[ind] - temp_times = all_time[ind] - for time in temp_times: - vals = time==temp_times - if any(vals): - ind2 = np.nonzero(vals) - i = np.nonzero(time==times) - allY[i, j] = temp_strengths[ind2] - allX[i, 0] = temp_x[ind2] - allX[i, 1] = temp_y[ind2] - allY = (allY + 85.)/15. - - X = allX[0:215, :] - Y = allY[0:215, :] - - Xtest = allX[215:, :] - Ytest = allY[215:, :] - return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'addresses' : addresses, 'times' : times}, data_set) - -def silhouette(data_set='ankur_pose_data'): - # Ankur Agarwal and Bill Trigg's silhoutte data. - if not data_available(data_set): - download_data(data_set) - mat_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'ankurDataPoseSilhouette.mat')) - inMean = np.mean(mat_data['Y']) - inScales = np.sqrt(np.var(mat_data['Y'])) - X = mat_data['Y'] - inMean - X = X / inScales - Xtest = mat_data['Y_test'] - inMean - Xtest = Xtest / inScales - Y = mat_data['Z'] - Ytest = mat_data['Z_test'] - return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest}, data_set) - -def decampos_digits(data_set='decampos_characters', which_digits=[0,1,2,3,4,5,6,7,8,9]): - if not data_available(data_set): - download_data(data_set) - path = os.path.join(data_path, data_set) - digits = np.load(os.path.join(path, 'digits.npy')) - digits = digits[which_digits,:,:,:] - num_classes, num_samples, height, width = digits.shape - Y = digits.reshape((digits.shape[0]*digits.shape[1],digits.shape[2]*digits.shape[3])) - lbls = np.array([[l]*num_samples for l in which_digits]).reshape(Y.shape[0], 1) - str_lbls = np.array([[str(l)]*num_samples for l in which_digits]) - return data_details_return({'Y': Y, 'lbls': lbls, 'str_lbls' : str_lbls, 'info': 'Digits data set from the de Campos characters data'}, data_set) - -def ripley_synth(data_set='ripley_prnn_data'): - if not data_available(data_set): - download_data(data_set) - train = np.genfromtxt(os.path.join(data_path, data_set, 'synth.tr'), skip_header=1) - X = train[:, 0:2] - y = train[:, 2:3] - test = np.genfromtxt(os.path.join(data_path, data_set, 'synth.te'), skip_header=1) - Xtest = test[:, 0:2] - ytest = test[:, 2:3] - return data_details_return({'X': X, 'Y': y, 'Xtest': Xtest, 'Ytest': ytest, 'info': 'Synthetic data generated by Ripley for a two class classification problem.'}, data_set) - -def mauna_loa(data_set='mauna_loa', num_train=543, refresh_data=False): - path = os.path.join(data_path, data_set) - if data_available(data_set) and not refresh_data: - print 'Using cached version of the data set, to use latest version set refresh_data to True' - else: - download_data(data_set) - data = np.loadtxt(os.path.join(data_path, data_set, 'co2_mm_mlo.txt')) - print 'Most recent data observation from month ', data[-1, 1], ' in year ', data[-1, 0] - allX = data[data[:, 3]!=-99.99, 2:3] - allY = data[data[:, 3]!=-99.99, 3:4] - X = allX[:num_train, 0:1] - Xtest = allX[num_train:, 0:1] - Y = allY[:num_train, 0:1] - Ytest = allY[num_train:, 0:1] - return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'info': "Mauna Loa data with " + str(num_train) + " values used as training points."}, data_set) - - -def boxjenkins_airline(data_set='boxjenkins_airline', num_train=96): - path = os.path.join(data_path, data_set) - if not data_available(data_set): - download_data(data_set) - data = np.loadtxt(os.path.join(data_path, data_set, 'boxjenkins_airline.csv'), delimiter=',') - Y = data[:num_train, 1:2] - X = data[:num_train, 0:1] - Xtest = data[num_train:, 0:1] - Ytest = data[num_train:, 1:2] - return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'info': "Montly airline passenger data from Box & Jenkins 1976."}, data_set) - - -def osu_run1(data_set='osu_run1', sample_every=4): - path = os.path.join(data_path, data_set) - if not data_available(data_set): - download_data(data_set) - zip = zipfile.ZipFile(os.path.join(data_path, data_set, 'run1TXT.ZIP'), 'r') - for name in zip.namelist(): - zip.extract(name, path) - Y, connect = GPy.util.mocap.load_text_data('Aug210106', path) - Y = Y[0:-1:sample_every, :] - return data_details_return({'Y': Y, 'connect' : connect}, data_set) - -def swiss_roll_generated(num_samples=1000, sigma=0.0): - with open(os.path.join(data_path, 'swiss_roll.pickle')) as f: - data = pickle.load(f) - Na = data['Y'].shape[0] - perm = np.random.permutation(np.r_[:Na])[:num_samples] - Y = data['Y'][perm, :] - t = data['t'][perm] - c = data['colors'][perm, :] - so = np.argsort(t) - Y = Y[so, :] - t = t[so] - c = c[so, :] - return {'Y':Y, 't':t, 'colors':c} - -def hapmap3(data_set='hapmap3'): - """ - The HapMap phase three SNP dataset - 1184 samples out of 11 populations. - - SNP_matrix (A) encoding [see Paschou et all. 2007 (PCA-Correlated SNPs...)]: - Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then - - / 1, iff SNPij==(B1,B1) - Aij = | 0, iff SNPij==(B1,B2) - \ -1, iff SNPij==(B2,B2) - - The SNP data and the meta information (such as iid, sex and phenotype) are - stored in the dataframe datadf, index is the Individual ID, - with following columns for metainfo: - - * family_id -> Family ID - * paternal_id -> Paternal ID - * maternal_id -> Maternal ID - * sex -> Sex (1=male; 2=female; other=unknown) - * phenotype -> Phenotype (-9, or 0 for unknown) - * population -> Population string (e.g. 'ASW' - 'YRI') - * rest are SNP rs (ids) - - More information is given in infodf: - - * Chromosome: - - autosomal chromosemes -> 1-22 - - X X chromosome -> 23 - - Y Y chromosome -> 24 - - XY Pseudo-autosomal region of X -> 25 - - MT Mitochondrial -> 26 - * Relative Positon (to Chromosome) [base pairs] - """ - try: - from pandas import read_pickle, DataFrame - from sys import stdout - import bz2 - except ImportError as i: - raise i, "Need pandas for hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset" - if not data_available(data_set): - download_data(data_set) - dirpath = os.path.join(data_path,'hapmap3') - hapmap_file_name = 'hapmap3_r2_b36_fwd.consensus.qc.poly' - preprocessed_data_paths = [os.path.join(dirpath,hapmap_file_name + file_name) for file_name in \ - ['.snps.pickle', - '.info.pickle', - '.nan.pickle']] - if not reduce(lambda a,b: a and b, map(os.path.exists, preprocessed_data_paths)): - if not overide_manual_authorize and not prompt_user("Preprocessing requires ~25GB " - "of memory and can take a (very) long time, continue? [Y/n]"): - print "Preprocessing required for further usage." - return - status = "Preprocessing data, please be patient..." - print status - def write_status(message, progress, status): - stdout.write(" "*len(status)); stdout.write("\r"); stdout.flush() - status = r"[{perc: <{ll}}] {message: <13s}".format(message=message, ll=20, - perc="="*int(20.*progress/100.)) - stdout.write(status); stdout.flush() - return status - unpacked_files = [os.path.join(dirpath, hapmap_file_name+ending) for ending in ['.ped', '.map']] - if not reduce(lambda a,b: a and b, map(os.path.exists, unpacked_files)): - status=write_status('unpacking...', 0, '') - curr = 0 - for newfilepath in unpacked_files: - if not os.path.exists(newfilepath): - filepath = newfilepath + '.bz2' - file_size = os.path.getsize(filepath) - with open(newfilepath, 'wb') as new_file, open(filepath, 'rb') as f: - decomp = bz2.BZ2Decompressor() - file_processed = 0 - buffsize = 100 * 1024 - for data in iter(lambda : f.read(buffsize), b''): - new_file.write(decomp.decompress(data)) - file_processed += len(data) - status=write_status('unpacking...', curr+12.*file_processed/(file_size), status) - curr += 12 - status=write_status('unpacking...', curr, status) - status=write_status('reading .ped...', 25, status) - # Preprocess data: - snpstrnp = np.loadtxt(unpacked_files[0], dtype=str) - status=write_status('reading .map...', 33, status) - mapnp = np.loadtxt(unpacked_files[1], dtype=str) - status=write_status('reading relationships.txt...', 42, status) - # and metainfo: - infodf = DataFrame.from_csv(os.path.join(dirpath,'./relationships_w_pops_121708.txt'), header=0, sep='\t') - infodf.set_index('IID', inplace=1) - status=write_status('filtering nan...', 45, status) - snpstr = snpstrnp[:,6:].astype('S1').reshape(snpstrnp.shape[0], -1, 2) - inan = snpstr[:,:,0] == '0' - status=write_status('filtering reference alleles...', 55, status) - ref = np.array(map(lambda x: np.unique(x)[-2:], snpstr.swapaxes(0,1)[:,:,:])) - status=write_status('encoding snps...', 70, status) - # Encode the information for each gene in {-1,0,1}: - status=write_status('encoding snps...', 73, status) - snps = (snpstr==ref[None,:,:]) - status=write_status('encoding snps...', 76, status) - snps = (snps*np.array([1,-1])[None,None,:]) - status=write_status('encoding snps...', 78, status) - snps = snps.sum(-1) - status=write_status('encoding snps...', 81, status) - snps = snps.astype('i8') - status=write_status('marking nan values...', 88, status) - # put in nan values (masked as -128): - snps[inan] = -128 - status=write_status('setting up meta...', 94, status) - # get meta information: - metaheader = np.r_[['family_id', 'iid', 'paternal_id', 'maternal_id', 'sex', 'phenotype']] - metadf = DataFrame(columns=metaheader, data=snpstrnp[:,:6]) - metadf.set_index('iid', inplace=1) - metadf = metadf.join(infodf.population) - metadf.to_pickle(preprocessed_data_paths[1]) - # put everything together: - status=write_status('setting up snps...', 96, status) - snpsdf = DataFrame(index=metadf.index, data=snps, columns=mapnp[:,1]) - with open(preprocessed_data_paths[0], 'wb') as f: - pickle.dump(f, snpsdf, protocoll=-1) - status=write_status('setting up snps...', 98, status) - inandf = DataFrame(index=metadf.index, data=inan, columns=mapnp[:,1]) - inandf.to_pickle(preprocessed_data_paths[2]) - status=write_status('done :)', 100, status) - print '' - else: - print "loading snps..." - snpsdf = read_pickle(preprocessed_data_paths[0]) - print "loading metainfo..." - metadf = read_pickle(preprocessed_data_paths[1]) - print "loading nan entries..." - inandf = read_pickle(preprocessed_data_paths[2]) - snps = snpsdf.values - populations = metadf.population.values.astype('S3') - hapmap = dict(name=data_set, - description='The HapMap phase three SNP dataset - ' - '1184 samples out of 11 populations. inan is a ' - 'boolean array, containing wheather or not the ' - 'given entry is nan (nans are masked as ' - '-128 in snps).', - snpsdf=snpsdf, - metadf=metadf, - snps=snps, - inan=inandf.values, - inandf=inandf, - populations=populations) - return hapmap - -def singlecell(data_set='singlecell'): - if not data_available(data_set): - download_data(data_set) - dirpath = os.path.join(data_path, data_set) - data = np.loadtxt(os.path.join(dirpath, 'singlecell.csv'), delimiter=",", dtype=str) - genes = data[0, 1:] - labels = data[1:, 0] - Y = np.array(data[1:, 1:], dtype=float) - return data_details_return({'Y': Y, 'info' : "qPCR Singlecell experiment in Mouse, measuring 48 gene expressions in 1-64 cell states. The labels have been created as in Guo et al. [2010]", - 'genes':genes, 'labels':labels, - }, data_set) - -def swiss_roll_1000(): - return swiss_roll(num_samples=1000) - -def swiss_roll(num_samples=3000, data_set='swiss_roll'): - if not data_available(data_set): - download_data(data_set) - mat_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'swiss_roll_data.mat')) - Y = mat_data['X_data'][:, 0:num_samples].transpose() - return data_details_return({'Y': Y, 'X': mat_data['X_data'], 'info': "The first " + str(num_samples) + " points from the swiss roll data of Tennenbaum, de Silva and Langford (2001)."}, data_set) - -def isomap_faces(num_samples=698, data_set='isomap_face_data'): - if not data_available(data_set): - download_data(data_set) - mat_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'face_data.mat')) - Y = mat_data['images'][:, 0:num_samples].transpose() - return data_details_return({'Y': Y, 'poses' : mat_data['poses'], 'lights': mat_data['lights'], 'info': "The first " + str(num_samples) + " points from the face data of Tennenbaum, de Silva and Langford (2001)."}, data_set) - -def simulation_BGPLVM(): - mat_data = scipy.io.loadmat(os.path.join(data_path, 'BGPLVMSimulation.mat')) - Y = np.array(mat_data['Y'], dtype=float) - S = np.array(mat_data['initS'], dtype=float) - mu = np.array(mat_data['initMu'], dtype=float) - #return data_details_return({'S': S, 'Y': Y, 'mu': mu}, data_set) - return {'Y': Y, 'S': S, - 'mu' : mu, - 'info': "Simulated test dataset generated in MATLAB to compare BGPLVM between python and MATLAB"} - -def toy_rbf_1d(seed=default_seed, num_samples=500): - """ - Samples values of a function from an RBF covariance with very small noise for inputs uniformly distributed between -1 and 1. - - :param seed: seed to use for random sampling. - :type seed: int - :param num_samples: number of samples to sample in the function (default 500). - :type num_samples: int - - """ - np.random.seed(seed=seed) - num_in = 1 - X = np.random.uniform(low= -1.0, high=1.0, size=(num_samples, num_in)) - X.sort(axis=0) - rbf = GPy.kern.RBF(num_in, variance=1., lengthscale=np.array((0.25,))) - white = GPy.kern.White(num_in, variance=1e-2) - kernel = rbf + white - K = kernel.K(X) - y = np.reshape(np.random.multivariate_normal(np.zeros(num_samples), K), (num_samples, 1)) - return {'X':X, 'Y':y, 'info': "Sampled " + str(num_samples) + " values of a function from an RBF covariance with very small noise for inputs uniformly distributed between -1 and 1."} - -def toy_rbf_1d_50(seed=default_seed): - np.random.seed(seed=seed) - data = toy_rbf_1d() - indices = np.random.permutation(data['X'].shape[0]) - indices = indices[0:50] - indices.sort(axis=0) - X = data['X'][indices, :] - Y = data['Y'][indices, :] - return {'X': X, 'Y': Y, 'info': "Subsamples the toy_rbf_sample with 50 values randomly taken from the original sample.", 'seed' : seed} - - -def toy_linear_1d_classification(seed=default_seed): - np.random.seed(seed=seed) - x1 = np.random.normal(-3, 5, 20) - x2 = np.random.normal(3, 5, 20) - X = (np.r_[x1, x2])[:, None] - return {'X': X, 'Y': sample_class(2.*X), 'F': 2.*X, 'seed' : seed} - -def olivetti_glasses(data_set='olivetti_glasses', num_training=200, seed=default_seed): - path = os.path.join(data_path, data_set) - if not data_available(data_set): - download_data(data_set) - y = np.load(os.path.join(path, 'has_glasses.np')) - y = np.where(y=='y',1,0).reshape(-1,1) - faces = scipy.io.loadmat(os.path.join(path, 'olivettifaces.mat'))['faces'].T - np.random.seed(seed=seed) - index = np.random.permutation(faces.shape[0]) - X = faces[index[:num_training],:] - Xtest = faces[index[num_training:],:] - Y = y[index[:num_training],:] - Ytest = y[index[num_training:]] - return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'seed' : seed, 'info': "ORL Faces with labels identifiying who is wearing glasses and who isn't. Data is randomly partitioned according to given seed. Presence or absence of glasses was labelled by James Hensman."}, 'olivetti_faces') - -def olivetti_faces(data_set='olivetti_faces'): - path = os.path.join(data_path, data_set) - if not data_available(data_set): - download_data(data_set) - zip = zipfile.ZipFile(os.path.join(path, 'att_faces.zip'), 'r') - for name in zip.namelist(): - zip.extract(name, path) - Y = [] - lbls = [] - for subject in range(40): - for image in range(10): - image_path = os.path.join(path, 'orl_faces', 's'+str(subject+1), str(image+1) + '.pgm') - Y.append(GPy.util.netpbmfile.imread(image_path).flatten()) - lbls.append(subject) - Y = np.asarray(Y) - lbls = np.asarray(lbls)[:, None] - return data_details_return({'Y': Y, 'lbls' : lbls, 'info': "ORL Faces processed to 64x64 images."}, data_set) - -def xw_pen(data_set='xw_pen'): - if not data_available(data_set): - download_data(data_set) - Y = np.loadtxt(os.path.join(data_path, data_set, 'xw_pen_15.csv'), delimiter=',') - X = np.arange(485)[:, None] - return data_details_return({'Y': Y, 'X': X, 'info': "Tilt data from a personalized digital assistant pen. Plot in original paper showed regression between time steps 175 and 275."}, data_set) - - -def download_rogers_girolami_data(data_set='rogers_girolami_data'): - if not data_available('rogers_girolami_data'): - download_data(data_set) - path = os.path.join(data_path, data_set) - tar_file = os.path.join(path, 'firstcoursemldata.tar.gz') - tar = tarfile.open(tar_file) - print('Extracting file.') - tar.extractall(path=path) - tar.close() - -def olympic_100m_men(data_set='rogers_girolami_data'): - download_rogers_girolami_data() - olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['male100'] - - X = olympic_data[:, 0][:, None] - Y = olympic_data[:, 1][:, None] - return data_details_return({'X': X, 'Y': Y, 'info': "Olympic sprint times for 100 m men from 1896 until 2008. Example is from Rogers and Girolami's First Course in Machine Learning."}, data_set) - -def olympic_100m_women(data_set='rogers_girolami_data'): - download_rogers_girolami_data() - olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['female100'] - - X = olympic_data[:, 0][:, None] - Y = olympic_data[:, 1][:, None] - return data_details_return({'X': X, 'Y': Y, 'info': "Olympic sprint times for 100 m women from 1896 until 2008. Example is from Rogers and Girolami's First Course in Machine Learning."}, data_set) - -def olympic_200m_women(data_set='rogers_girolami_data'): - download_rogers_girolami_data() - olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['female200'] - - X = olympic_data[:, 0][:, None] - Y = olympic_data[:, 1][:, None] - return data_details_return({'X': X, 'Y': Y, 'info': "Olympic 200 m winning times for women from 1896 until 2008. Data is from Rogers and Girolami's First Course in Machine Learning."}, data_set) - -def olympic_200m_men(data_set='rogers_girolami_data'): - download_rogers_girolami_data() - olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['male200'] - - X = olympic_data[:, 0][:, None] - Y = olympic_data[:, 1][:, None] - return data_details_return({'X': X, 'Y': Y, 'info': "Male 200 m winning times for women from 1896 until 2008. Data is from Rogers and Girolami's First Course in Machine Learning."}, data_set) - -def olympic_400m_women(data_set='rogers_girolami_data'): - download_rogers_girolami_data() - olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['female400'] - - X = olympic_data[:, 0][:, None] - Y = olympic_data[:, 1][:, None] - return data_details_return({'X': X, 'Y': Y, 'info': "Olympic 400 m winning times for women until 2008. Data is from Rogers and Girolami's First Course in Machine Learning."}, data_set) - -def olympic_400m_men(data_set='rogers_girolami_data'): - download_rogers_girolami_data() - olympic_data = scipy.io.loadmat(os.path.join(data_path, data_set, 'data', 'olympics.mat'))['male400'] - - X = olympic_data[:, 0][:, None] - Y = olympic_data[:, 1][:, None] - return data_details_return({'X': X, 'Y': Y, 'info': "Male 400 m winning times for women until 2008. Data is from Rogers and Girolami's First Course in Machine Learning."}, data_set) - -def olympic_marathon_men(data_set='olympic_marathon_men'): - if not data_available(data_set): - download_data(data_set) - olympics = np.genfromtxt(os.path.join(data_path, data_set, 'olympicMarathonTimes.csv'), delimiter=',') - X = olympics[:, 0:1] - Y = olympics[:, 1:2] - return data_details_return({'X': X, 'Y': Y}, data_set) - -def olympic_sprints(data_set='rogers_girolami_data'): - """All olympics sprint winning times for multiple output prediction.""" - X = np.zeros((0, 2)) - Y = np.zeros((0, 1)) - for i, dataset in enumerate([olympic_100m_men, - olympic_100m_women, - olympic_200m_men, - olympic_200m_women, - olympic_400m_men, - olympic_400m_women]): - data = dataset() - year = data['X'] - time = data['Y'] - X = np.vstack((X, np.hstack((year, np.ones_like(year)*i)))) - Y = np.vstack((Y, time)) - data['X'] = X - data['Y'] = Y - data['info'] = "Olympics sprint event winning for men and women to 2008. Data is from Rogers and Girolami's First Course in Machine Learning." - return data_details_return({ - 'X': X, - 'Y': Y, - 'info': "Olympics sprint event winning for men and women to 2008. Data is from Rogers and Girolami's First Course in Machine Learning.", - 'output_info': { - 0:'100m Men', - 1:'100m Women', - 2:'200m Men', - 3:'200m Women', - 4:'400m Men', - 5:'400m Women'} - }, data_set) - -# def movielens_small(partNo=1,seed=default_seed): -# np.random.seed(seed=seed) - -# fileName = os.path.join(data_path, 'movielens', 'small', 'u' + str(partNo) + '.base') -# fid = open(fileName) -# uTrain = np.fromfile(fid, sep='\t', dtype=np.int16).reshape((-1, 4)) -# fid.close() -# maxVals = np.amax(uTrain, axis=0) -# numUsers = maxVals[0] -# numFilms = maxVals[1] -# numRatings = uTrain.shape[0] - -# Y = scipy.sparse.lil_matrix((numFilms, numUsers), dtype=np.int8) -# for i in range(numUsers): -# ind = pb.mlab.find(uTrain[:, 0]==i+1) -# Y[uTrain[ind, 1]-1, i] = uTrain[ind, 2] - -# fileName = os.path.join(data_path, 'movielens', 'small', 'u' + str(partNo) + '.test') -# fid = open(fileName) -# uTest = np.fromfile(fid, sep='\t', dtype=np.int16).reshape((-1, 4)) -# fid.close() -# numTestRatings = uTest.shape[0] - -# Ytest = scipy.sparse.lil_matrix((numFilms, numUsers), dtype=np.int8) -# for i in range(numUsers): -# ind = pb.mlab.find(uTest[:, 0]==i+1) -# Ytest[uTest[ind, 1]-1, i] = uTest[ind, 2] - -# lbls = np.empty((1,1)) -# lblstest = np.empty((1,1)) -# return {'Y':Y, 'lbls':lbls, 'Ytest':Ytest, 'lblstest':lblstest} - - -def crescent_data(num_data=200, seed=default_seed): - """ -Data set formed from a mixture of four Gaussians. In each class two of the Gaussians are elongated at right angles to each other and offset to form an approximation to the crescent data that is popular in semi-supervised learning as a toy problem. - - :param num_data_part: number of data to be sampled (default is 200). - :type num_data: int - :param seed: random seed to be used for data generation. - :type seed: int - - """ - np.random.seed(seed=seed) - sqrt2 = np.sqrt(2) - # Rotation matrix - R = np.array([[sqrt2 / 2, -sqrt2 / 2], [sqrt2 / 2, sqrt2 / 2]]) - # Scaling matrices - scales = [] - scales.append(np.array([[3, 0], [0, 1]])) - scales.append(np.array([[3, 0], [0, 1]])) - scales.append([[1, 0], [0, 3]]) - scales.append([[1, 0], [0, 3]]) - means = [] - means.append(np.array([4, 4])) - means.append(np.array([0, 4])) - means.append(np.array([-4, -4])) - means.append(np.array([0, -4])) - - Xparts = [] - num_data_part = [] - num_data_total = 0 - for i in range(0, 4): - num_data_part.append(round(((i + 1) * num_data) / 4.)) - num_data_part[i] -= num_data_total - part = np.random.normal(size=(num_data_part[i], 2)) - part = np.dot(np.dot(part, scales[i]), R) + means[i] - Xparts.append(part) - num_data_total += num_data_part[i] - X = np.vstack((Xparts[0], Xparts[1], Xparts[2], Xparts[3])) - - Y = np.vstack((np.ones((num_data_part[0] + num_data_part[1], 1)), -np.ones((num_data_part[2] + num_data_part[3], 1)))) - return {'X':X, 'Y':Y, 'info': "Two separate classes of data formed approximately in the shape of two crescents."} - -def creep_data(data_set='creep_rupture'): - """Brun and Yoshida's metal creep rupture data.""" - if not data_available(data_set): - download_data(data_set) - path = os.path.join(data_path, data_set) - tar_file = os.path.join(path, 'creeprupt.tar') - tar = tarfile.open(tar_file) - print('Extracting file.') - tar.extractall(path=path) - tar.close() - all_data = np.loadtxt(os.path.join(data_path, data_set, 'taka')) - y = all_data[:, 1:2].copy() - features = [0] - features.extend(range(2, 31)) - X = all_data[:, features].copy() - return data_details_return({'X': X, 'y': y}, data_set) - -def cmu_mocap_49_balance(data_set='cmu_mocap'): - """Load CMU subject 49's one legged balancing motion that was used by Alvarez, Luengo and Lawrence at AISTATS 2009.""" - train_motions = ['18', '19'] - test_motions = ['20'] - data = cmu_mocap('49', train_motions, test_motions, sample_every=4, data_set=data_set) - data['info'] = "One legged balancing motions from CMU data base subject 49. As used in Alvarez, Luengo and Lawrence at AISTATS 2009. It consists of " + data['info'] - return data - -def cmu_mocap_35_walk_jog(data_set='cmu_mocap'): - """Load CMU subject 35's walking and jogging motions, the same data that was used by Taylor, Roweis and Hinton at NIPS 2007. but without their preprocessing. Also used by Lawrence at AISTATS 2007.""" - train_motions = ['01', '02', '03', '04', '05', '06', - '07', '08', '09', '10', '11', '12', - '13', '14', '15', '16', '17', '19', - '20', '21', '22', '23', '24', '25', - '26', '28', '30', '31', '32', '33', '34'] - test_motions = ['18', '29'] - data = cmu_mocap('35', train_motions, test_motions, sample_every=4, data_set=data_set) - data['info'] = "Walk and jog data from CMU data base subject 35. As used in Tayor, Roweis and Hinton at NIPS 2007, but without their pre-processing (i.e. as used by Lawrence at AISTATS 2007). It consists of " + data['info'] - return data - -def cmu_mocap(subject, train_motions, test_motions=[], sample_every=4, data_set='cmu_mocap'): - """Load a given subject's training and test motions from the CMU motion capture data.""" - # Load in subject skeleton. - subject_dir = os.path.join(data_path, data_set) - - # Make sure the data is downloaded. - all_motions = train_motions + test_motions - resource = cmu_urls_files(([subject], [all_motions])) - data_resources[data_set] = data_resources['cmu_mocap_full'].copy() - data_resources[data_set]['files'] = resource['files'] - data_resources[data_set]['urls'] = resource['urls'] - if resource['urls']: - download_data(data_set) - - skel = GPy.util.mocap.acclaim_skeleton(os.path.join(subject_dir, subject + '.asf')) - - # Set up labels for each sequence - exlbls = np.eye(len(train_motions)) - - # Load sequences - tot_length = 0 - temp_Y = [] - temp_lbls = [] - for i in range(len(train_motions)): - temp_chan = skel.load_channels(os.path.join(subject_dir, subject + '_' + train_motions[i] + '.amc')) - temp_Y.append(temp_chan[::sample_every, :]) - temp_lbls.append(np.tile(exlbls[i, :], (temp_Y[i].shape[0], 1))) - tot_length += temp_Y[i].shape[0] - - Y = np.zeros((tot_length, temp_Y[0].shape[1])) - lbls = np.zeros((tot_length, temp_lbls[0].shape[1])) - - end_ind = 0 - for i in range(len(temp_Y)): - start_ind = end_ind - end_ind += temp_Y[i].shape[0] - Y[start_ind:end_ind, :] = temp_Y[i] - lbls[start_ind:end_ind, :] = temp_lbls[i] - if len(test_motions) > 0: - temp_Ytest = [] - temp_lblstest = [] - - testexlbls = np.eye(len(test_motions)) - tot_test_length = 0 - for i in range(len(test_motions)): - temp_chan = skel.load_channels(os.path.join(subject_dir, subject + '_' + test_motions[i] + '.amc')) - temp_Ytest.append(temp_chan[::sample_every, :]) - temp_lblstest.append(np.tile(testexlbls[i, :], (temp_Ytest[i].shape[0], 1))) - tot_test_length += temp_Ytest[i].shape[0] - - # Load test data - Ytest = np.zeros((tot_test_length, temp_Ytest[0].shape[1])) - lblstest = np.zeros((tot_test_length, temp_lblstest[0].shape[1])) - - end_ind = 0 - for i in range(len(temp_Ytest)): - start_ind = end_ind - end_ind += temp_Ytest[i].shape[0] - Ytest[start_ind:end_ind, :] = temp_Ytest[i] - lblstest[start_ind:end_ind, :] = temp_lblstest[i] - else: - Ytest = None - lblstest = None - - info = 'Subject: ' + subject + '. Training motions: ' - for motion in train_motions: - info += motion + ', ' - info = info[:-2] - if len(test_motions) > 0: - info += '. Test motions: ' - for motion in test_motions: - info += motion + ', ' - info = info[:-2] + '.' - else: - info += '.' - if sample_every != 1: - info += ' Data is sub-sampled to every ' + str(sample_every) + ' frames.' - return data_details_return({'Y': Y, 'lbls' : lbls, 'Ytest': Ytest, 'lblstest' : lblstest, 'info': info, 'skel': skel}, data_set) - - diff --git a/GPy/util/datasets/data_resources_create.py b/GPy/util/datasets/data_resources_create.py deleted file mode 100644 index 919e3ea4..00000000 --- a/GPy/util/datasets/data_resources_create.py +++ /dev/null @@ -1,176 +0,0 @@ -import json - -neil_url = 'http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/' -sam_url = 'http://www.cs.nyu.edu/~roweis/data/' -cmu_url = 'http://mocap.cs.cmu.edu/subjects/' - -data_resources = {'ankur_pose_data' : {'urls' : [neil_url + 'ankur_pose_data/'], - 'files' : [['ankurDataPoseSilhouette.mat']], - 'license' : None, - 'citation' : """3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.""", - 'details' : """Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing."""}, - - 'boston_housing' : {'urls' : ['http://archive.ics.uci.edu/ml/machine-learning-databases/housing/'], - 'files' : [['Index', 'housing.data', 'housing.names']], - 'citation' : """Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.""", - 'details' : """The Boston Housing data relates house values in Boston to a range of input variables.""", - 'license' : None, - 'size' : 51276 - }, - 'brendan_faces' : {'urls' : [sam_url], - 'files': [['frey_rawface.mat']], - 'citation' : 'Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.', - 'details' : """A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.""", - 'license': None, - 'size' : 1100584}, - 'cmu_mocap_full' : {'urls' : ['http://mocap.cs.cmu.edu'], - 'files' : [['allasfamc.zip']], - 'citation' : """Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.' - 'The database was created with funding from NSF EIA-0196217.""", - 'details' : """CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.""", - 'license' : """From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.""", - 'size' : None}, - 'creep_rupture' : {'urls' : ['http://www.msm.cam.ac.uk/map/data/tar/'], - 'files' : [['creeprupt.tar']], - 'citation' : 'Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.', - 'details' : """Provides 2066 creep rupture test results of steels (mainly of two kinds of steels: 2.25Cr and 9-12 wt% Cr ferritic steels). See http://www.msm.cam.ac.uk/map/data/materials/creeprupt-b.html.""", - 'license' : None, - 'size' : 602797}, - 'della_gatta' : {'urls' : [neil_url + 'della_gatta/'], - 'files': [['DellaGattadata.mat']], - 'citation' : 'Direct targets of the TRP63 transcription factor revealed by a combination of gene expression profiling and reverse engineering. Giusy Della Gatta, Mukesh Bansal, Alberto Ambesi-Impiombato, Dario Antonini, Caterina Missero, and Diego di Bernardo, Genome Research 2008', - 'details': "The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA.", - 'license':None, - 'size':3729650}, - 'epomeo_gpx' : {'urls' : [neil_url + 'epomeo_gpx/'], - 'files': [['endomondo_1.gpx', 'endomondo_2.gpx', 'garmin_watch_via_endomondo.gpx','viewranger_phone.gpx','viewranger_tablet.gpx']], - 'citation' : '', - 'details': "Five different GPS traces of the same run up Mount Epomeo in Ischia. The traces are from different sources. endomondo_1 and endomondo_2 are traces from the mobile phone app Endomondo, with a split in the middle. garmin_watch_via_endomondo is the trace from a Garmin watch, with a segment missing about 4 kilometers in. viewranger_phone and viewranger_tablet are traces from a phone and a tablet through the viewranger app. The viewranger_phone data comes from the same mobile phone as the Endomondo data (i.e. there are 3 GPS devices, but one device recorded two traces).", - 'license':None, - 'size': 2031872}, - 'three_phase_oil_flow': {'urls' : [neil_url + 'three_phase_oil_flow/'], - 'files' : [['DataTrnLbls.txt', 'DataTrn.txt', 'DataTst.txt', 'DataTstLbls.txt', 'DataVdn.txt', 'DataVdnLbls.txt']], - 'citation' : 'Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. Nuclear Instruments and Methods in Physics Research A327, 580-593', - 'details' : """The three phase oil data used initially for demonstrating the Generative Topographic mapping.""", - 'license' : None, - 'size' : 712796}, - 'rogers_girolami_data' : {'urls' : ['https://www.dropbox.com/sh/7p6tu1t29idgliq/_XqlH_3nt9/'], - 'files' : [['firstcoursemldata.tar.gz']], - 'suffices' : [['?dl=1']], - 'citation' : 'A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146', - 'details' : """Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.""", - 'license' : None, - 'size' : 21949154}, - 'olivetti_faces' : {'urls' : [neil_url + 'olivetti_faces/', sam_url], - 'files' : [['att_faces.zip'], ['olivettifaces.mat']], - 'citation' : 'Ferdinando Samaria and Andy Harter, Parameterisation of a Stochastic Model for Human Face Identification. Proceedings of 2nd IEEE Workshop on Applications of Computer Vision, Sarasota FL, December 1994', - 'details' : """Olivetti Research Labs Face data base, acquired between December 1992 and December 1994 in the Olivetti Research Lab, Cambridge (which later became AT&T Laboratories, Cambridge). When using these images please give credit to AT&T Laboratories, Cambridge. """, - 'license': None, - 'size' : 8561331}, - 'olympic_marathon_men' : {'urls' : [neil_url + 'olympic_marathon_men/'], - 'files' : [['olympicMarathonTimes.csv']], - 'citation' : None, - 'details' : """Olympic mens' marathon gold medal winning times from 1896 to 2012. Time given in pace (minutes per kilometer). Data is originally downloaded and collated from Wikipedia, we are not responsible for errors in the data""", - 'license': None, - 'size' : 584}, - 'osu_run1' : {'urls': ['http://accad.osu.edu/research/mocap/data/', neil_url + 'stick/'], - 'files': [['run1TXT.ZIP'],['connections.txt']], - 'details' : "Motion capture data of a stick man running from the Open Motion Data Project at Ohio State University.", - 'citation' : 'The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.', - 'license' : 'Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).', - 'size': 338103}, - 'osu_accad' : {'urls': ['http://accad.osu.edu/research/mocap/data/', neil_url + 'stick/'], - 'files': [['swagger1TXT.ZIP','handspring1TXT.ZIP','quickwalkTXT.ZIP','run1TXT.ZIP','sprintTXT.ZIP','dogwalkTXT.ZIP','camper_04TXT.ZIP','dance_KB3_TXT.ZIP','per20_TXT.ZIP','perTWO07_TXT.ZIP','perTWO13_TXT.ZIP','perTWO14_TXT.ZIP','perTWO15_TXT.ZIP','perTWO16_TXT.ZIP'],['connections.txt']], - 'details' : "Motion capture data of different motions from the Open Motion Data Project at Ohio State University.", - 'citation' : 'The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.', - 'license' : 'Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).', - 'size': 15922790}, - 'pumadyn-32nm' : {'urls' : ['ftp://ftp.cs.toronto.edu/pub/neuron/delve/data/tarfiles/pumadyn-family/'], - 'files' : [['pumadyn-32nm.tar.gz']], - 'details' : """Pumadyn non linear 32 input data set with moderate noise. See http://www.cs.utoronto.ca/~delve/data/pumadyn/desc.html for details.""", - 'citation' : """Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.""", - 'license' : """Data is made available by the Delve system at the University of Toronto""", - 'size' : 5861646}, - 'robot_wireless' : {'urls' : [neil_url + 'robot_wireless/'], - 'files' : [['uw-floor.txt']], - 'citation' : """WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.""", - 'details' : """Data created by Brian Ferris and Dieter Fox. Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.""", - 'license' : None, - 'size' : 284390}, - 'swiss_roll' : {'urls' : ['http://isomap.stanford.edu/'], - 'files' : [['swiss_roll_data.mat']], - 'details' : """Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.""", - 'citation' : 'A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000', - 'license' : None, - 'size' : 800256}, - 'ripley_prnn_data' : {'urls' : ['http://www.stats.ox.ac.uk/pub/PRNN/'], - 'files' : [['Cushings.dat', 'README', 'crabs.dat', 'fglass.dat', 'fglass.grp', 'pima.te', 'pima.tr', 'pima.tr2', 'synth.te', 'synth.tr', 'viruses.dat', 'virus3.dat']], - 'details' : """Data sets from Brian Ripley's Pattern Recognition and Neural Networks""", - 'citation': """Pattern Recognition and Neural Networks by B.D. Ripley (1996) Cambridge University Press ISBN 0 521 46986 7""", - 'license' : None, - 'size' : 93565}, - 'isomap_face_data' : {'urls' : [neil_url + 'isomap_face_data/'], - 'files' : [['face_data.mat']], - 'details' : """Face data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.""", - 'citation' : 'A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000', - 'license' : None, - 'size' : 24229368}, - 'xw_pen' : {'urls' : [neil_url + 'xw_pen/'], - 'files' : [['xw_pen_15.csv']], - 'details' : """Accelerometer pen data used for robust regression by Tipping and Lawrence.""", - 'citation' : 'Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005', - 'license' : None, - 'size' : 3410}, - 'hapmap3' : {'urls' : ['http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/latest_phaseIII_ncbi_b36/plink_format/'], - 'files' : [['hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2', 'hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2', 'relationships_w_pops_121708.txt']], - 'details' : """ - HapMap Project: Single Nucleotide Polymorphism sequenced in all human populations. - The HapMap phase three SNP dataset - 1184 samples out of 11 populations. - See http://www.nature.com/nature/journal/v426/n6968/abs/nature02168.html for details. - - SNP_matrix (A) encoding [see Paschou et all. 2007 (PCA-Correlated SNPs...)]: - Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then - - / 1, iff SNPij==(B1,B1) - Aij = | 0, iff SNPij==(B1,B2) - \ -1, iff SNPij==(B2,B2) - - The SNP data and the meta information (such as iid, sex and phenotype) are - stored in the dataframe datadf, index is the Individual ID, - with following columns for metainfo: - - * family_id -> Family ID - * paternal_id -> Paternal ID - * maternal_id -> Maternal ID - * sex -> Sex (1=male; 2=female; other=unknown) - * phenotype -> Phenotype (-9, or 0 for unknown) - * population -> Population string (e.g. 'ASW' - 'YRI') - * rest are SNP rs (ids) - - More information is given in infodf: - - * Chromosome: - - autosomal chromosemes -> 1-22 - - X X chromosome -> 23 - - Y Y chromosome -> 24 - - XY Pseudo-autosomal region of X -> 25 - - MT Mitochondrial -> 26 - * Relative Positon (to Chromosome) [base pairs] - - """, - 'citation': """Gibbs, Richard A., et al. "The international HapMap project." Nature 426.6968 (2003): 789-796.""", - 'license' : """International HapMap Project Public Access License (http://hapmap.ncbi.nlm.nih.gov/cgi-perl/registration#licence)""", - 'size' : 2*1729092237 + 62265}, - - 'singlecell' : {'urls' : ["http://staffwww.dcs.sheffield.ac.uk/people/M.Zwiessele/data/singlecell/"], - 'files' : [['singlecell.csv']], - 'details' : "qPCR Singlecell experiment in Mouse, measuring 48 gene expressions in 1-64 cell states. The labels have been created as in Guo et al. [2010]", - 'citation' : "Guoji Guo, Mikael Huss, Guo Qing Tong, Chaoyang Wang, Li Li Sun, Neil D. Clarke, Paul Robson, Resolution of Cell Fate Decisions Revealed by Single-Cell Gene Expression Analysis from Zygote to Blastocyst, Developmental Cell, Volume 18, Issue 4, 20 April 2010, Pages 675-685, ISSN 1534-5807, http://dx.doi.org/10.1016/j.devcel.2010.02.012. (http://www.sciencedirect.com/science/article/pii/S1534580710001103) Keywords: DEVBIO", - 'license' : "ScienceDirect: http://www.elsevier.com/locate/termsandconditions?utm_source=sciencedirect&utm_medium=link&utm_campaign=terms", - 'size' : 233.1, - } - } - -with open('data_resources.json', 'w') as f: - print "writing data_resources" - json.dump(data_resources, f) From 2df0f933f1d5a18122765513b70666f67efcb6f3 Mon Sep 17 00:00:00 2001 From: Neil Lawrence Date: Mon, 12 May 2014 09:49:56 +0100 Subject: [PATCH 19/43] Changes to datasets.py --- GPy/util/data_resources.json | 497 ++++++++++++++++++++- GPy/util/datasets.py | 13 +- GPy/util/datasets/data_resources_create.py | 176 -------- 3 files changed, 508 insertions(+), 178 deletions(-) delete mode 100644 GPy/util/datasets/data_resources_create.py diff --git a/GPy/util/data_resources.json b/GPy/util/data_resources.json index a4a82edd..4615614d 100644 --- a/GPy/util/data_resources.json +++ b/GPy/util/data_resources.json @@ -1 +1,496 @@ -{"rogers_girolami_data": {"files": [["firstcoursemldata.tar.gz"]], "license": null, "citation": "A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146", "details": "Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.", "urls": ["https://www.dropbox.com/sh/7p6tu1t29idgliq/_XqlH_3nt9/"], "suffices": [["?dl=1"]], "size": 21949154}, "ankur_pose_data": {"files": [["ankurDataPoseSilhouette.mat"]], "citation": "3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.", "license": null, "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/ankur_pose_data/"], "details": "Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing."}, "osu_accad": {"files": [["swagger1TXT.ZIP", "handspring1TXT.ZIP", "quickwalkTXT.ZIP", "run1TXT.ZIP", "sprintTXT.ZIP", "dogwalkTXT.ZIP", "camper_04TXT.ZIP", "dance_KB3_TXT.ZIP", "per20_TXT.ZIP", "perTWO07_TXT.ZIP", "perTWO13_TXT.ZIP", "perTWO14_TXT.ZIP", "perTWO15_TXT.ZIP", "perTWO16_TXT.ZIP"], ["connections.txt"]], "license": "Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).", "citation": "The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.", "details": "Motion capture data of different motions from the Open Motion Data Project at Ohio State University.", "urls": ["http://accad.osu.edu/research/mocap/data/", "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/"], "size": 15922790}, "isomap_face_data": {"files": [["face_data.mat"]], "license": null, "citation": "A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", "details": "Face data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/isomap_face_data/"], "size": 24229368}, "boston_housing": {"files": [["Index", "housing.data", "housing.names"]], "license": null, "citation": "Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.", "details": "The Boston Housing data relates house values in Boston to a range of input variables.", "urls": ["http://archive.ics.uci.edu/ml/machine-learning-databases/housing/"], "size": 51276}, "cmu_mocap_full": {"files": [["allasfamc.zip"]], "license": "From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.", "citation": "Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.'\n 'The database was created with funding from NSF EIA-0196217.", "details": "CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.", "urls": ["http://mocap.cs.cmu.edu"], "size": null}, "brendan_faces": {"files": [["frey_rawface.mat"]], "license": null, "citation": "Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.", "details": "A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.", "urls": ["http://www.cs.nyu.edu/~roweis/data/"], "size": 1100584}, "singlecell": {"files": [["singlecell.csv"]], "license": "ScienceDirect: http://www.elsevier.com/locate/termsandconditions?utm_source=sciencedirect&utm_medium=link&utm_campaign=terms", "citation": "Guoji Guo, Mikael Huss, Guo Qing Tong, Chaoyang Wang, Li Li Sun, Neil D. Clarke, Paul Robson, Resolution of Cell Fate Decisions Revealed by Single-Cell Gene Expression Analysis from Zygote to Blastocyst, Developmental Cell, Volume 18, Issue 4, 20 April 2010, Pages 675-685, ISSN 1534-5807, http://dx.doi.org/10.1016/j.devcel.2010.02.012. (http://www.sciencedirect.com/science/article/pii/S1534580710001103) Keywords: DEVBIO", "details": "qPCR Singlecell experiment in Mouse, measuring 48 gene expressions in 1-64 cell states. The labels have been created as in Guo et al. [2010]", "urls": ["http://staffwww.dcs.sheffield.ac.uk/people/M.Zwiessele/data/singlecell/"], "size": 233.1}, "olympic_marathon_men": {"files": [["olympicMarathonTimes.csv"]], "license": null, "citation": null, "details": "Olympic mens' marathon gold medal winning times from 1896 to 2012. Time given in pace (minutes per kilometer). Data is originally downloaded and collated from Wikipedia, we are not responsible for errors in the data", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olympic_marathon_men/"], "size": 584}, "pumadyn-32nm": {"files": [["pumadyn-32nm.tar.gz"]], "license": "Data is made available by the Delve system at the University of Toronto", "citation": "Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.", "details": "Pumadyn non linear 32 input data set with moderate noise. See http://www.cs.utoronto.ca/~delve/data/pumadyn/desc.html for details.", "urls": ["ftp://ftp.cs.toronto.edu/pub/neuron/delve/data/tarfiles/pumadyn-family/"], "size": 5861646}, "ripley_prnn_data": {"files": [["Cushings.dat", "README", "crabs.dat", "fglass.dat", "fglass.grp", "pima.te", "pima.tr", "pima.tr2", "synth.te", "synth.tr", "viruses.dat", "virus3.dat"]], "license": null, "citation": "Pattern Recognition and Neural Networks by B.D. Ripley (1996) Cambridge University Press ISBN 0 521 46986 7", "details": "Data sets from Brian Ripley's Pattern Recognition and Neural Networks", "urls": ["http://www.stats.ox.ac.uk/pub/PRNN/"], "size": 93565}, "three_phase_oil_flow": {"files": [["DataTrnLbls.txt", "DataTrn.txt", "DataTst.txt", "DataTstLbls.txt", "DataVdn.txt", "DataVdnLbls.txt"]], "license": null, "citation": "Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. Nuclear Instruments and Methods in Physics Research A327, 580-593", "details": "The three phase oil data used initially for demonstrating the Generative Topographic mapping.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/three_phase_oil_flow/"], "size": 712796}, "robot_wireless": {"files": [["uw-floor.txt"]], "license": null, "citation": "WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.", "details": "Data created by Brian Ferris and Dieter Fox. Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/robot_wireless/"], "size": 284390}, "xw_pen": {"files": [["xw_pen_15.csv"]], "license": null, "citation": "Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005", "details": "Accelerometer pen data used for robust regression by Tipping and Lawrence.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/xw_pen/"], "size": 3410}, "swiss_roll": {"files": [["swiss_roll_data.mat"]], "license": null, "citation": "A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", "details": "Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", "urls": ["http://isomap.stanford.edu/"], "size": 800256}, "osu_run1": {"files": [["run1TXT.ZIP"], ["connections.txt"]], "license": "Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).", "citation": "The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.", "details": "Motion capture data of a stick man running from the Open Motion Data Project at Ohio State University.", "urls": ["http://accad.osu.edu/research/mocap/data/", "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/"], "size": 338103}, "creep_rupture": {"files": [["creeprupt.tar"]], "license": null, "citation": "Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.", "details": "Provides 2066 creep rupture test results of steels (mainly of two kinds of steels: 2.25Cr and 9-12 wt% Cr ferritic steels). See http://www.msm.cam.ac.uk/map/data/materials/creeprupt-b.html.", "urls": ["http://www.msm.cam.ac.uk/map/data/tar/"], "size": 602797}, "hapmap3": {"files": [["hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2", "hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2", "relationships_w_pops_121708.txt"]], "license": "International HapMap Project Public Access License (http://hapmap.ncbi.nlm.nih.gov/cgi-perl/registration#licence)", "citation": "Gibbs, Richard A., et al. \"The international HapMap project.\" Nature 426.6968 (2003): 789-796.", "details": "\n HapMap Project: Single Nucleotide Polymorphism sequenced in all human populations. \n The HapMap phase three SNP dataset - 1184 samples out of 11 populations.\n See http://www.nature.com/nature/journal/v426/n6968/abs/nature02168.html for details.\n\n SNP_matrix (A) encoding [see Paschou et all. 2007 (PCA-Correlated SNPs...)]:\n Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then\n\n / 1, iff SNPij==(B1,B1)\n Aij = | 0, iff SNPij==(B1,B2)\n \\ -1, iff SNPij==(B2,B2)\n\n The SNP data and the meta information (such as iid, sex and phenotype) are\n stored in the dataframe datadf, index is the Individual ID, \n with following columns for metainfo:\n\n * family_id -> Family ID\n * paternal_id -> Paternal ID\n * maternal_id -> Maternal ID\n * sex -> Sex (1=male; 2=female; other=unknown)\n * phenotype -> Phenotype (-9, or 0 for unknown)\n * population -> Population string (e.g. 'ASW' - 'YRI')\n * rest are SNP rs (ids)\n\n More information is given in infodf:\n\n * Chromosome:\n - autosomal chromosemes -> 1-22\n - X X chromosome -> 23\n - Y Y chromosome -> 24\n - XY Pseudo-autosomal region of X -> 25\n - MT Mitochondrial -> 26\n * Relative Positon (to Chromosome) [base pairs]\n\n ", "urls": ["http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/latest_phaseIII_ncbi_b36/plink_format/"], "size": 3458246739}, "olivetti_faces": {"files": [["att_faces.zip"], ["olivettifaces.mat"]], "license": null, "citation": "Ferdinando Samaria and Andy Harter, Parameterisation of a Stochastic Model for Human Face Identification. Proceedings of 2nd IEEE Workshop on Applications of Computer Vision, Sarasota FL, December 1994", "details": "Olivetti Research Labs Face data base, acquired between December 1992 and December 1994 in the Olivetti Research Lab, Cambridge (which later became AT&T Laboratories, Cambridge). When using these images please give credit to AT&T Laboratories, Cambridge. ", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olivetti_faces/", "http://www.cs.nyu.edu/~roweis/data/"], "size": 8561331}, "della_gatta": {"files": [["DellaGattadata.mat"]], "license": null, "citation": "Direct targets of the TRP63 transcription factor revealed by a combination of gene expression profiling and reverse engineering. Giusy Della Gatta, Mukesh Bansal, Alberto Ambesi-Impiombato, Dario Antonini, Caterina Missero, and Diego di Bernardo, Genome Research 2008", "details": "The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/della_gatta/"], "size": 3729650}, "epomeo_gpx": {"files": [["endomondo_1.gpx", "endomondo_2.gpx", "garmin_watch_via_endomondo.gpx", "viewranger_phone.gpx", "viewranger_tablet.gpx"]], "license": null, "citation": "", "details": "Five different GPS traces of the same run up Mount Epomeo in Ischia. The traces are from different sources. endomondo_1 and endomondo_2 are traces from the mobile phone app Endomondo, with a split in the middle. garmin_watch_via_endomondo is the trace from a Garmin watch, with a segment missing about 4 kilometers in. viewranger_phone and viewranger_tablet are traces from a phone and a tablet through the viewranger app. The viewranger_phone data comes from the same mobile phone as the Endomondo data (i.e. there are 3 GPS devices, but one device recorded two traces).", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/epomeo_gpx/"], "size": 2031872}} \ No newline at end of file +{ + "ankur_pose_data": { + "citation": "3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.", + "details": "Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing.", + "files": [ + [ + "ankurDataPoseSilhouette.mat" + ] + ], + "license": null, + "size": 1, + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/ankur_pose_data/" + ] + }, + "boston_housing": { + "citation": "Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.", + "details": "The Boston Housing data relates house values in Boston to a range of input variables.", + "files": [ + [ + "Index", + "housing.data", + "housing.names" + ] + ], + "license": null, + "size": 51276, + "urls": [ + "http://archive.ics.uci.edu/ml/machine-learning-databases/housing/" + ] + }, + "boxjenkins_airline": { + "citation": "Box & Jenkins (1976), in file: data/airpass, Description: International airline passengers: monthly totals in thousands. Jan 49 \\u2013 Dec 60", + "details": "International airline passengers, monthly totals from January 1949 to December 1960.", + "files": [ + [ + "boxjenkins_airline.csv" + ] + ], + "license": "You may copy and redistribute the data. You may make derivative works from the data. You may use the data for commercial purposes. You may not sublicence the data when redistributing it. You may not redistribute the data under a different license. Source attribution on any use of this data: Must refer source.", + "size": 46779, + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/boxjenkins_airline/" + ] + }, + "brendan_faces": { + "citation": "Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.", + "details": "A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.", + "files": [ + [ + "frey_rawface.mat" + ] + ], + "license": null, + "size": 1100584, + "urls": [ + "http://www.cs.nyu.edu/~roweis/data/" + ] + }, + "cmu_mocap_full": { + "citation": "Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.\\nThe database was created with funding from NSF EIA-0196217.", + "details": "CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.", + "files": [ + [ + "allasfamc.zip" + ] + ], + "license": "From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.", + "size": null, + "urls": [ + "http://mocap.cs.cmu.edu/subjects" + ] + }, + "creep_rupture": { + "citation": "Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.", + "details": "Provides 2066 creep rupture test results of steels (mainly of two kinds of steels: 2.25Cr and 9-12 wt% Cr ferritic steels). See http://www.msm.cam.ac.uk/map/data/materials/creeprupt-b.html.", + "files": [ + [ + "creeprupt.tar" + ] + ], + "license": null, + "size": 602797, + "urls": [ + "http://www.msm.cam.ac.uk/map/data/tar/" + ] + }, + "decampos_characters": { + "citation": "T. de Campos, B. R. Babu, and M. Varma. Character recognition in natural images. VISAPP 2009.", + "details": "Examples of hand written digits taken from the de Campos et al paper on Character Recognition in Natural Images.", + "files": [ + [ + "characters.npy", + "digits.npy" + ] + ], + "license": null, + "size": 2031872, + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/decampos_digits/" + ] + }, + "della_gatta": { + "citation": "Direct targets of the TRP63 transcription factor revealed by a combination of gene expression profiling and reverse engineering. Giusy Della Gatta, Mukesh Bansal, Alberto Ambesi-Impiombato, Dario Antonini, Caterina Missero, and Diego di Bernardo, Genome Research 2008", + "details": "The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA.", + "files": [ + [ + "DellaGattadata.mat" + ] + ], + "license": null, + "size": 3729650, + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/della_gatta/" + ] + }, + "epomeo_gpx": { + "citation": "", + "details": "Five different GPS traces of the same run up Mount Epomeo in Ischia. The traces are from different sources. endomondo_1 and endomondo_2 are traces from the mobile phone app Endomondo, with a split in the middle. garmin_watch_via_endomondo is the trace from a Garmin watch, with a segment missing about 4 kilometers in. viewranger_phone and viewranger_tablet are traces from a phone and a tablet through the viewranger app. The viewranger_phone data comes from the same mobile phone as the Endomondo data (i.e. there are 3 GPS devices, but one device recorded two traces).", + "files": [ + [ + "endomondo_1.gpx", + "endomondo_2.gpx", + "garmin_watch_via_endomondo.gpx", + "viewranger_phone.gpx", + "viewranger_tablet.gpx" + ] + ], + "license": null, + "size": 2031872, + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/epomeo_gpx/" + ] + }, + "football_data": { + "citation": "", + "details": "Results of English football matches since 1993/94 season.", + "files": [ + [ + "E0.csv", + "E1.csv", + "E2.csv", + "E3.csv" + ] + ], + "license": null, + "size": 1, + "urls": [ + "http://www.football-data.co.uk/mmz4281/" + ] + }, + "fruitfly_tomancak": { + "citation": "'Systematic determination of patterns of gene expression during Drosophila embryogenesis' Pavel Tomancak, Amy Beaton, Richard Weiszmann, Elaine Kwan, ShengQiang Shu, Suzanna E Lewis, Stephen Richards, Michael Ashburner, Volker Hartenstein, Susan E Celniker, and Gerald M Rubin", + "details": "Gene expression results from blastoderm development in Drosophila Melanogaster.", + "files": [ + [ + "embryo_tc_4_1.CEL", + "embryo_tc_4_2.CEL", + "embryo_tc_4_3.CEL", + "embryo_tc_4_4.CEL", + "embryo_tc_4_5.CEL", + "embryo_tc_4_6.CEL", + "embryo_tc_4_7.CEL", + "embryo_tc_4_8.CEL", + "embryo_tc_4_9.CEL", + "embryo_tc_4_10.CEL", + "embryo_tc_4_11.CEL", + "embryo_tc_4_12.CEL", + "embryo_tc_6_1.CEL", + "embryo_tc_6_2.CEL", + "embryo_tc_6_3.CEL", + "embryo_tc_6_4.CEL", + "embryo_tc_6_5.CEL", + "embryo_tc_6_6.CEL", + "embryo_tc_6_7.CEL", + "embryo_tc_6_8.CEL", + "embryo_tc_6_9.CEL", + "embryo_tc_6_10.CEL", + "embryo_tc_6_11.CEL", + "embryo_tc_6_12.CEL", + "embryo_tc_8_1.CEL", + "embryo_tc_8_2.CEL", + "embryo_tc_8_3.CEL", + "embryo_tc_8_4.CEL", + "embryo_tc_8_5.CEL", + "embryo_tc_8_6.CEL", + "embryo_tc_8_7.CEL", + "embryo_tc_8_8.CEL", + "embryo_tc_8_9.CEL", + "embryo_tc_8_10.CEL", + "embryo_tc_8_11.CEL", + "embryo_tc_8_12.CEL", + "CG_AffyOligo_Gadfly3_01_13_03", + "embryo_tc_rma_release2.txt", + "embryo_tc_rma_release3.txt", + "na_affy_oligo.dros", + "README.TXT" + ] + ], + "license": null, + "size": 1, + "urls": [ + "ftp://ftp.fruitfly.org/pub/embryo_tc_array_data/" + ] + }, + "google_trends": { + "citation": "", + "details": "Google trends results.", + "files": [ + [ + + ] + ], + "license": null, + "size": 0, + "urls": [ + "http://www.google.com/trends/" + ] + }, + "hapmap3": { + "citation": "Gibbs, Richard A., et al. 'The international HapMap project.' Nature 426.6968 (2003): 789-796.", + "details": "HapMap Project: Single Nucleotide Polymorphism sequenced in all human populations. \n The HapMap phase three SNP dataset - 1184 samples out of 11 populations.\n See http://www.nature.com/nature/journal/v426/n6968/abs/nature02168.html for details.\n\n SNP_matrix (A) encoding [see Paschou et all. 2007 (PCA-Correlated SNPs...)]:\n Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then\n\n / 1, iff SNPij==(B1,B1)\n Aij = | 0, iff SNPij==(B1,B2)\n \\\\ -1, iff SNPij==(B2,B2)\n\n The SNP data and the meta information (such as iid, sex and phenotype) are\n stored in the dataframe datadf, index is the Individual ID, \n with following columns for metainfo:\n\n * family_id -> Family ID\n * paternal_id -> Paternal ID\n * maternal_id -> Maternal ID\n * sex -> Sex (1=male; 2=female; other=unknown)\n * phenotype -> Phenotype (-9, or 0 for unknown)\n * population -> Population string (e.g. 'ASW' - 'YRI')\n * rest are SNP rs (ids)\n\n More information is given in infodf:\n\n * Chromosome:\n - autosomal chromosemes -> 1-22\n - X X chromosome -> 23\n - Y Y chromosome -> 24\n - XY Pseudo-autosomal region of X -> 25\n - MT Mitochondrial -> 26\n * Relative Positon (to Chromosome) [base pairs]\n\n ", + "files": [ + [ + "hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2", + "hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2", + "relationships_w_pops_121708.txt" + ] + ], + "license": "International HapMap Project Public Access License (http://hapmap.ncbi.nlm.nih.gov/cgi-perl/registration#licence)", + "size": 3458246739, + "urls": [ + "http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/latest_phaseIII_ncbi_b36/plink_format/" + ] + }, + "isomap_face_data": { + "citation": "A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", + "details": "Face data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", + "files": [ + [ + "face_data.mat" + ] + ], + "license": null, + "size": 24229368, + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/isomap_face_data/" + ] + }, + "mauna_loa": { + "citation": "Mauna Loa Data. Dr. Pieter Tans, NOAA/ESRL (www.esrl.noaa.gov/gmd/ccgg/trends/) and Dr. Ralph Keeling, Scripps Institution of Oceanography (scrippsco2.ucsd.edu/).", + "details": "The 'average' column contains the monthly mean CO2 mole fraction determined from daily averages. The mole fraction of CO2, expressed as parts per million (ppm) is the number of molecules of CO2 in every one million molecules of dried air (water vapor removed). If there are missing days concentrated either early or late in the month, the monthly mean is corrected to the middle of the month using the average seasonal cycle. Missing months are denoted by -99.99. The 'interpolated' column includes average values from the preceding column and interpolated values where data are missing. Interpolated values are computed in two steps. First, we compute for each month the average seasonal cycle in a 7-year window around each monthly value. In this way the seasonal cycle is allowed to change slowly over time. We then determine the 'trend' value for each month by removing the seasonal cycle; this result is shown in the 'trend' column. Trend values are linearly interpolated for missing months. The interpolated monthly mean is then the sum of the average seasonal cycle value and the trend value for the missing month.\n\nNOTE: In general, the data presented for the last year are subject to change, depending on recalibration of the reference gas mixtures used, and other quality control procedures. Occasionally, earlier years may also be changed for the same reasons. Usually these changes are minor.\n\nCO2 expressed as a mole fraction in dry air, micromol/mol, abbreviated as ppm \n\n (-99.99 missing data; -1 no data for daily means in month)", + "files": [ + [ + "co2_mm_mlo.txt" + ] + ], + "license": "-------------------------------------------------------------------- USE OF NOAA ESRL DATA\n\n These data are made freely available to the public and the scientific community in the belief that their wide dissemination will lead to greater understanding and new scientific insights. The availability of these data does not constitute publication of the data. NOAA relies on the ethics and integrity of the user to insure that ESRL receives fair credit for their work. If the data are obtained for potential use in a publication or presentation, ESRL should be informed at the outset of the nature of this work. If the ESRL data are essential to the work, or if an important result or conclusion depends on the ESRL data, co-authorship may be appropriate. This should be discussed at an early stage in the work. Manuscripts using the ESRL data should be sent to ESRL for review before they are submitted for publication so we can insure that the quality and limitations of the data are accurately represented.\n\n Contact: Pieter Tans (303 497 6678; pieter.tans@noaa.gov)\n\n RECIPROCITY Use of these data implies an agreement to reciprocate. Laboratories making similar measurements agree to make their own data available to the general public and to the scientific community in an equally complete and easily accessible form. Modelers are encouraged to make available to the community, upon request, their own tools used in the interpretation of the ESRL data, namely well documented model code, transport fields, and additional information necessary for other scientists to repeat the work and to run modified versions. Model availability includes collaborative support for new users of the models.\n --------------------------------------------------------------------\n\n See www.esrl.noaa.gov/gmd/ccgg/trends/ for additional details.", + "size": 46779, + "urls": [ + "ftp://aftp.cmdl.noaa.gov/products/trends/co2/" + ] + }, + "olivetti_faces": { + "citation": "Ferdinando Samaria and Andy Harter, Parameterisation of a Stochastic Model for Human Face Identification. Proceedings of 2nd IEEE Workshop on Applications of Computer Vision, Sarasota FL, December 1994", + "details": "Olivetti Research Labs Face data base, acquired between December 1992 and December 1994 in the Olivetti Research Lab, Cambridge (which later became AT&T Laboratories, Cambridge). When using these images please give credit to AT&T Laboratories, Cambridge. ", + "files": [ + [ + "att_faces.zip" + ], + [ + "olivettifaces.mat" + ] + ], + "license": null, + "size": 8561331, + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olivetti_faces/", + "http://www.cs.nyu.edu/~roweis/data/" + ] + }, + "olivetti_glasses": { + "citation": "Information recorded in olivetti_faces entry. Should be used from there.", + "details": "Information recorded in olivetti_faces entry. Should be used from there.", + "files": [ + [ + "has_glasses.np" + ], + [ + "olivettifaces.mat" + ] + ], + "license": null, + "size": 4261047, + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olivetti_faces/", + "http://www.cs.nyu.edu/~roweis/data/" + ] + }, + "olympic_marathon_men": { + "citation": null, + "details": "Olympic mens' marathon gold medal winning times from 1896 to 2012. Time given in pace (minutes per kilometer). Data is originally downloaded and collated from Wikipedia, we are not responsible for errors in the data", + "files": [ + [ + "olympicMarathonTimes.csv" + ] + ], + "license": null, + "size": 584, + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olympic_marathon_men/" + ] + }, + "osu_accad": { + "citation": "The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.", + "details": "Motion capture data of different motions from the Open Motion Data Project at Ohio State University.", + "files": [ + [ + "swagger1TXT.ZIP", + "handspring1TXT.ZIP", + "quickwalkTXT.ZIP", + "run1TXT.ZIP", + "sprintTXT.ZIP", + "dogwalkTXT.ZIP", + "camper_04TXT.ZIP", + "dance_KB3_TXT.ZIP", + "per20_TXT.ZIP", + "perTWO07_TXT.ZIP", + "perTWO13_TXT.ZIP", + "perTWO14_TXT.ZIP", + "perTWO15_TXT.ZIP", + "perTWO16_TXT.ZIP" + ], + [ + "connections.txt" + ] + ], + "license": "Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).", + "size": 15922790, + "urls": [ + "http://accad.osu.edu/research/mocap/data/", + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/" + ] + }, + "osu_run1": { + "citation": "The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.", + "details": "Motion capture data of a stick man running from the Open Motion Data Project at Ohio State University.", + "files": [ + [ + "run1TXT.ZIP" + ], + [ + "connections.txt" + ] + ], + "license": "Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).", + "size": 338103, + "urls": [ + "http://accad.osu.edu/research/mocap/data/", + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/" + ] + }, + "pumadyn-32nm": { + "citation": "Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.", + "details": "Pumadyn non linear 32 input data set with moderate noise. See http://www.cs.utoronto.ca/~delve/data/pumadyn/desc.html for details.", + "files": [ + [ + "pumadyn-32nm.tar.gz" + ] + ], + "license": "Data is made available by the Delve system at the University of Toronto", + "size": 5861646, + "urls": [ + "ftp://ftp.cs.toronto.edu/pub/neuron/delve/data/tarfiles/pumadyn-family/" + ] + }, + "ripley_prnn_data": { + "citation": "Pattern Recognition and Neural Networks by B.D. Ripley (1996) Cambridge University Press ISBN 0 521 46986 7", + "details": "Data sets from Brian Ripley's Pattern Recognition and Neural Networks", + "files": [ + [ + "Cushings.dat", + "README", + "crabs.dat", + "fglass.dat", + "fglass.grp", + "pima.te", + "pima.tr", + "pima.tr2", + "synth.te", + "synth.tr", + "viruses.dat", + "virus3.dat" + ] + ], + "license": null, + "size": 93565, + "urls": [ + "http://www.stats.ox.ac.uk/pub/PRNN/" + ] + }, + "robot_wireless": { + "citation": "WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.", + "details": "Data created by Brian Ferris and Dieter Fox. Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.", + "files": [ + [ + "uw-floor.txt" + ] + ], + "license": null, + "size": 284390, + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/robot_wireless/" + ] + }, + "rogers_girolami_data": { + "citation": "A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146", + "details": "Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.", + "files": [ + [ + "firstcoursemldata.tar.gz" + ] + ], + "license": null, + "size": 21949154, + "suffices": [ + [ + "?dl=1" + ] + ], + "urls": [ + "https://www.dropbox.com/sh/7p6tu1t29idgliq/_XqlH_3nt9/" + ] + }, + "singlecell": { + "citation": "Guoji Guo, Mikael Huss, Guo Qing Tong, Chaoyang Wang, Li Li Sun, Neil D. Clarke, Paul Robson, Resolution of Cell Fate Decisions Revealed by Single-Cell Gene Expression Analysis from Zygote to Blastocyst, Developmental Cell, Volume 18, Issue 4, 20 April 2010, Pages 675-685, ISSN 1534-5807, http://dx.doi.org/10.1016/j.devcel.2010.02.012. (http://www.sciencedirect.com/science/article/pii/S1534580710001103) Keywords: DEVBIO", + "details": "qPCR Singlecell experiment in Mouse, measuring 48 gene expressions in 1-64 cell states. The labels have been created as in Guo et al. [2010]", + "files": [ + [ + "singlecell.csv" + ] + ], + "license": "ScienceDirect: http://www.elsevier.com/locate/termsandconditions?utm_source=sciencedirect&utm_medium=link&utm_campaign=terms", + "size": 233.1, + "urls": [ + "http://staffwww.dcs.sheffield.ac.uk/people/M.Zwiessele/data/singlecell/" + ] + }, + "swiss_roll": { + "citation": "A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", + "details": "Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", + "files": [ + [ + "swiss_roll_data.mat" + ] + ], + "license": null, + "size": 800256, + "urls": [ + "http://isomap.stanford.edu/" + ] + }, + "three_phase_oil_flow": { + "citation": "Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. Nuclear Instruments and Methods in Physics Research A327, 580-593", + "details": "The three phase oil data used initially for demonstrating the Generative Topographic mapping.", + "files": [ + [ + "DataTrnLbls.txt", + "DataTrn.txt", + "DataTst.txt", + "DataTstLbls.txt", + "DataVdn.txt", + "DataVdnLbls.txt" + ] + ], + "license": null, + "size": 712796, + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/three_phase_oil_flow/" + ] + }, + "xw_pen": { + "citation": "Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005", + "details": "Accelerometer pen data used for robust regression by Tipping and Lawrence.", + "files": [ + [ + "xw_pen_15.csv" + ] + ], + "license": null, + "size": 3410, + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/xw_pen/" + ] + } +} diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py index 02c5cdb9..bdd55066 100644 --- a/GPy/util/datasets.py +++ b/GPy/util/datasets.py @@ -108,7 +108,11 @@ def download_url(url, store_directory, save_name = None, messages = True, suffix raise ValueError('Tried url ' + url + suffix + ' and received server error ' + str(response.code)) with open(save_name, 'wb') as f: meta = response.info() - file_size = int(meta.getheaders("Content-Length")[0]) + content_length_str = meta.getheaders("Content-Length") + if content_length_str: + file_size = int(content_length_str[0]) + else: + file_size = 1e10 status = "" file_size_dl = 0 block_sz = 8192 @@ -350,6 +354,13 @@ def football_data(season='1314', data_set='football_data'): Y = table[:, 4:] return data_details_return({'X': X, 'Y': Y}, data_set) +def fruitfly_tomancak(data_set='fruitfly_tomancak', gene_number=None): + if not data_available(data_set): + download_data(data_set) + X = None + Y = None + return data_details_return({'X': X, 'Y': Y, 'gene_number' : gene_number}, data_set) + # This will be for downloading google trends data. def google_trends(query_terms=['big data', 'machine learning', 'data science'], data_set='google_trends'): """Data downloaded from Google trends for given query terms. Warning, if you use this function multiple times in a row you get blocked due to terms of service violations.""" diff --git a/GPy/util/datasets/data_resources_create.py b/GPy/util/datasets/data_resources_create.py deleted file mode 100644 index 919e3ea4..00000000 --- a/GPy/util/datasets/data_resources_create.py +++ /dev/null @@ -1,176 +0,0 @@ -import json - -neil_url = 'http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/' -sam_url = 'http://www.cs.nyu.edu/~roweis/data/' -cmu_url = 'http://mocap.cs.cmu.edu/subjects/' - -data_resources = {'ankur_pose_data' : {'urls' : [neil_url + 'ankur_pose_data/'], - 'files' : [['ankurDataPoseSilhouette.mat']], - 'license' : None, - 'citation' : """3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.""", - 'details' : """Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing."""}, - - 'boston_housing' : {'urls' : ['http://archive.ics.uci.edu/ml/machine-learning-databases/housing/'], - 'files' : [['Index', 'housing.data', 'housing.names']], - 'citation' : """Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.""", - 'details' : """The Boston Housing data relates house values in Boston to a range of input variables.""", - 'license' : None, - 'size' : 51276 - }, - 'brendan_faces' : {'urls' : [sam_url], - 'files': [['frey_rawface.mat']], - 'citation' : 'Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.', - 'details' : """A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.""", - 'license': None, - 'size' : 1100584}, - 'cmu_mocap_full' : {'urls' : ['http://mocap.cs.cmu.edu'], - 'files' : [['allasfamc.zip']], - 'citation' : """Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.' - 'The database was created with funding from NSF EIA-0196217.""", - 'details' : """CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.""", - 'license' : """From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.""", - 'size' : None}, - 'creep_rupture' : {'urls' : ['http://www.msm.cam.ac.uk/map/data/tar/'], - 'files' : [['creeprupt.tar']], - 'citation' : 'Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.', - 'details' : """Provides 2066 creep rupture test results of steels (mainly of two kinds of steels: 2.25Cr and 9-12 wt% Cr ferritic steels). See http://www.msm.cam.ac.uk/map/data/materials/creeprupt-b.html.""", - 'license' : None, - 'size' : 602797}, - 'della_gatta' : {'urls' : [neil_url + 'della_gatta/'], - 'files': [['DellaGattadata.mat']], - 'citation' : 'Direct targets of the TRP63 transcription factor revealed by a combination of gene expression profiling and reverse engineering. Giusy Della Gatta, Mukesh Bansal, Alberto Ambesi-Impiombato, Dario Antonini, Caterina Missero, and Diego di Bernardo, Genome Research 2008', - 'details': "The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA.", - 'license':None, - 'size':3729650}, - 'epomeo_gpx' : {'urls' : [neil_url + 'epomeo_gpx/'], - 'files': [['endomondo_1.gpx', 'endomondo_2.gpx', 'garmin_watch_via_endomondo.gpx','viewranger_phone.gpx','viewranger_tablet.gpx']], - 'citation' : '', - 'details': "Five different GPS traces of the same run up Mount Epomeo in Ischia. The traces are from different sources. endomondo_1 and endomondo_2 are traces from the mobile phone app Endomondo, with a split in the middle. garmin_watch_via_endomondo is the trace from a Garmin watch, with a segment missing about 4 kilometers in. viewranger_phone and viewranger_tablet are traces from a phone and a tablet through the viewranger app. The viewranger_phone data comes from the same mobile phone as the Endomondo data (i.e. there are 3 GPS devices, but one device recorded two traces).", - 'license':None, - 'size': 2031872}, - 'three_phase_oil_flow': {'urls' : [neil_url + 'three_phase_oil_flow/'], - 'files' : [['DataTrnLbls.txt', 'DataTrn.txt', 'DataTst.txt', 'DataTstLbls.txt', 'DataVdn.txt', 'DataVdnLbls.txt']], - 'citation' : 'Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. Nuclear Instruments and Methods in Physics Research A327, 580-593', - 'details' : """The three phase oil data used initially for demonstrating the Generative Topographic mapping.""", - 'license' : None, - 'size' : 712796}, - 'rogers_girolami_data' : {'urls' : ['https://www.dropbox.com/sh/7p6tu1t29idgliq/_XqlH_3nt9/'], - 'files' : [['firstcoursemldata.tar.gz']], - 'suffices' : [['?dl=1']], - 'citation' : 'A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146', - 'details' : """Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.""", - 'license' : None, - 'size' : 21949154}, - 'olivetti_faces' : {'urls' : [neil_url + 'olivetti_faces/', sam_url], - 'files' : [['att_faces.zip'], ['olivettifaces.mat']], - 'citation' : 'Ferdinando Samaria and Andy Harter, Parameterisation of a Stochastic Model for Human Face Identification. Proceedings of 2nd IEEE Workshop on Applications of Computer Vision, Sarasota FL, December 1994', - 'details' : """Olivetti Research Labs Face data base, acquired between December 1992 and December 1994 in the Olivetti Research Lab, Cambridge (which later became AT&T Laboratories, Cambridge). When using these images please give credit to AT&T Laboratories, Cambridge. """, - 'license': None, - 'size' : 8561331}, - 'olympic_marathon_men' : {'urls' : [neil_url + 'olympic_marathon_men/'], - 'files' : [['olympicMarathonTimes.csv']], - 'citation' : None, - 'details' : """Olympic mens' marathon gold medal winning times from 1896 to 2012. Time given in pace (minutes per kilometer). Data is originally downloaded and collated from Wikipedia, we are not responsible for errors in the data""", - 'license': None, - 'size' : 584}, - 'osu_run1' : {'urls': ['http://accad.osu.edu/research/mocap/data/', neil_url + 'stick/'], - 'files': [['run1TXT.ZIP'],['connections.txt']], - 'details' : "Motion capture data of a stick man running from the Open Motion Data Project at Ohio State University.", - 'citation' : 'The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.', - 'license' : 'Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).', - 'size': 338103}, - 'osu_accad' : {'urls': ['http://accad.osu.edu/research/mocap/data/', neil_url + 'stick/'], - 'files': [['swagger1TXT.ZIP','handspring1TXT.ZIP','quickwalkTXT.ZIP','run1TXT.ZIP','sprintTXT.ZIP','dogwalkTXT.ZIP','camper_04TXT.ZIP','dance_KB3_TXT.ZIP','per20_TXT.ZIP','perTWO07_TXT.ZIP','perTWO13_TXT.ZIP','perTWO14_TXT.ZIP','perTWO15_TXT.ZIP','perTWO16_TXT.ZIP'],['connections.txt']], - 'details' : "Motion capture data of different motions from the Open Motion Data Project at Ohio State University.", - 'citation' : 'The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.', - 'license' : 'Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).', - 'size': 15922790}, - 'pumadyn-32nm' : {'urls' : ['ftp://ftp.cs.toronto.edu/pub/neuron/delve/data/tarfiles/pumadyn-family/'], - 'files' : [['pumadyn-32nm.tar.gz']], - 'details' : """Pumadyn non linear 32 input data set with moderate noise. See http://www.cs.utoronto.ca/~delve/data/pumadyn/desc.html for details.""", - 'citation' : """Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.""", - 'license' : """Data is made available by the Delve system at the University of Toronto""", - 'size' : 5861646}, - 'robot_wireless' : {'urls' : [neil_url + 'robot_wireless/'], - 'files' : [['uw-floor.txt']], - 'citation' : """WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.""", - 'details' : """Data created by Brian Ferris and Dieter Fox. Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.""", - 'license' : None, - 'size' : 284390}, - 'swiss_roll' : {'urls' : ['http://isomap.stanford.edu/'], - 'files' : [['swiss_roll_data.mat']], - 'details' : """Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.""", - 'citation' : 'A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000', - 'license' : None, - 'size' : 800256}, - 'ripley_prnn_data' : {'urls' : ['http://www.stats.ox.ac.uk/pub/PRNN/'], - 'files' : [['Cushings.dat', 'README', 'crabs.dat', 'fglass.dat', 'fglass.grp', 'pima.te', 'pima.tr', 'pima.tr2', 'synth.te', 'synth.tr', 'viruses.dat', 'virus3.dat']], - 'details' : """Data sets from Brian Ripley's Pattern Recognition and Neural Networks""", - 'citation': """Pattern Recognition and Neural Networks by B.D. Ripley (1996) Cambridge University Press ISBN 0 521 46986 7""", - 'license' : None, - 'size' : 93565}, - 'isomap_face_data' : {'urls' : [neil_url + 'isomap_face_data/'], - 'files' : [['face_data.mat']], - 'details' : """Face data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.""", - 'citation' : 'A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000', - 'license' : None, - 'size' : 24229368}, - 'xw_pen' : {'urls' : [neil_url + 'xw_pen/'], - 'files' : [['xw_pen_15.csv']], - 'details' : """Accelerometer pen data used for robust regression by Tipping and Lawrence.""", - 'citation' : 'Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005', - 'license' : None, - 'size' : 3410}, - 'hapmap3' : {'urls' : ['http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/latest_phaseIII_ncbi_b36/plink_format/'], - 'files' : [['hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2', 'hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2', 'relationships_w_pops_121708.txt']], - 'details' : """ - HapMap Project: Single Nucleotide Polymorphism sequenced in all human populations. - The HapMap phase three SNP dataset - 1184 samples out of 11 populations. - See http://www.nature.com/nature/journal/v426/n6968/abs/nature02168.html for details. - - SNP_matrix (A) encoding [see Paschou et all. 2007 (PCA-Correlated SNPs...)]: - Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then - - / 1, iff SNPij==(B1,B1) - Aij = | 0, iff SNPij==(B1,B2) - \ -1, iff SNPij==(B2,B2) - - The SNP data and the meta information (such as iid, sex and phenotype) are - stored in the dataframe datadf, index is the Individual ID, - with following columns for metainfo: - - * family_id -> Family ID - * paternal_id -> Paternal ID - * maternal_id -> Maternal ID - * sex -> Sex (1=male; 2=female; other=unknown) - * phenotype -> Phenotype (-9, or 0 for unknown) - * population -> Population string (e.g. 'ASW' - 'YRI') - * rest are SNP rs (ids) - - More information is given in infodf: - - * Chromosome: - - autosomal chromosemes -> 1-22 - - X X chromosome -> 23 - - Y Y chromosome -> 24 - - XY Pseudo-autosomal region of X -> 25 - - MT Mitochondrial -> 26 - * Relative Positon (to Chromosome) [base pairs] - - """, - 'citation': """Gibbs, Richard A., et al. "The international HapMap project." Nature 426.6968 (2003): 789-796.""", - 'license' : """International HapMap Project Public Access License (http://hapmap.ncbi.nlm.nih.gov/cgi-perl/registration#licence)""", - 'size' : 2*1729092237 + 62265}, - - 'singlecell' : {'urls' : ["http://staffwww.dcs.sheffield.ac.uk/people/M.Zwiessele/data/singlecell/"], - 'files' : [['singlecell.csv']], - 'details' : "qPCR Singlecell experiment in Mouse, measuring 48 gene expressions in 1-64 cell states. The labels have been created as in Guo et al. [2010]", - 'citation' : "Guoji Guo, Mikael Huss, Guo Qing Tong, Chaoyang Wang, Li Li Sun, Neil D. Clarke, Paul Robson, Resolution of Cell Fate Decisions Revealed by Single-Cell Gene Expression Analysis from Zygote to Blastocyst, Developmental Cell, Volume 18, Issue 4, 20 April 2010, Pages 675-685, ISSN 1534-5807, http://dx.doi.org/10.1016/j.devcel.2010.02.012. (http://www.sciencedirect.com/science/article/pii/S1534580710001103) Keywords: DEVBIO", - 'license' : "ScienceDirect: http://www.elsevier.com/locate/termsandconditions?utm_source=sciencedirect&utm_medium=link&utm_campaign=terms", - 'size' : 233.1, - } - } - -with open('data_resources.json', 'w') as f: - print "writing data_resources" - json.dump(data_resources, f) From 5b8b3b2256c3fc1dd3404f8e2aace92b5525ba6c Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Mon, 12 May 2014 11:42:53 +0100 Subject: [PATCH 20/43] [copy] handled hierarchy error for copying --- GPy/core/parameterization/lists_and_dicts.py | 21 ++++--- GPy/core/parameterization/param.py | 7 +++ GPy/core/parameterization/parameter_core.py | 64 +++++++++++++++----- 3 files changed, 68 insertions(+), 24 deletions(-) diff --git a/GPy/core/parameterization/lists_and_dicts.py b/GPy/core/parameterization/lists_and_dicts.py index 604d0a01..13547c94 100644 --- a/GPy/core/parameterization/lists_and_dicts.py +++ b/GPy/core/parameterization/lists_and_dicts.py @@ -59,13 +59,14 @@ class ObservablesList(object): return self._poc.__repr__() def add(self, priority, observable, callble): - ins = 0 - for pr, _, _ in self: - if priority > pr: - break - ins += 1 - self._poc.insert(ins, (priority, weakref.ref(observable), callble)) - + if observable is not None: + ins = 0 + for pr, _, _ in self: + if priority > pr: + break + ins += 1 + self._poc.insert(ins, (priority, weakref.ref(observable), callble)) + def __str__(self): ret = [] curr_p = None @@ -96,8 +97,10 @@ class ObservablesList(object): def __deepcopy__(self, memo): self.flush() s = ObservablesList() - import copy - s._poc = copy.deepcopy(self._poc, memo) + for p,o,c in self._poc: + import copy + s.add(p, copy.deepcopy(o(), memo), copy.deepcopy(c, memo)) + s.flush() return s def __getstate__(self): diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index 7055838a..1c67b9d9 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -156,6 +156,13 @@ class Param(OptimizationHandlable, ObsAr): def _ensure_fixes(self): if not self._has_fixes(): self._fixes_ = numpy.ones(self._realsize_, dtype=bool) + #=========================================================================== + # parameterizable + #=========================================================================== + def traverse(self, visit, *args, **kwargs): + visit(self, *args, **kwargs) + + #=========================================================================== # Convenience #=========================================================================== diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index 68140763..93924678 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -17,7 +17,7 @@ from transformations import Logexp, NegativeLogexp, Logistic, __fixed__, FIXED, import numpy as np import re -__updated__ = '2014-04-16' +__updated__ = '2014-05-12' class HierarchyError(Exception): """ @@ -124,7 +124,7 @@ class Parentable(object): """ Disconnect this object from its parent """ - raise NotImplementedError, "Abstaract superclass" + raise NotImplementedError, "Abstract superclass" @property def _highest_parent_(self): @@ -162,14 +162,13 @@ class Pickleable(object): :param protocol: pickling protocol to use, python-pickle for details. """ import cPickle as pickle - import pickle #TODO: cPickle if isinstance(f, str): with open(f, 'w') as f: pickle.dump(self, f, protocol) else: pickle.dump(self, f, protocol) - #=========================================================================== + #=========================================================================== # copy and pickling #=========================================================================== def copy(self): @@ -177,7 +176,12 @@ class Pickleable(object): #raise NotImplementedError, "Copy is not yet implemented, TODO: Observable hierarchy" import copy memo = {} - memo[id(self._parent_)] = None + parents = [] + self.traverse_parents(parents.append) + # remove self, which is the first arguments + parents = [p for p in parents if p is not self] + for p in parents: + memo[id(p)] = None memo[id(self.gradient)] = None memo[id(self.param_array)] = None memo[id(self._fixes_)] = None @@ -202,9 +206,6 @@ class Pickleable(object): dc = dict() for k,v in self.__dict__.iteritems(): if k not in ignore_list: - #if hasattr(v, "__getstate__"): - #dc[k] = v.__getstate__() - #else: dc[k] = v return dc @@ -212,12 +213,6 @@ class Pickleable(object): self.__dict__.update(state) return self - #def __getstate__(self, memo): - # raise NotImplementedError, "get state must be implemented to be able to pickle objects" - - #def __setstate__(self, memo): - # raise NotImplementedError, "set state must be implemented to be able to pickle objects" - class Gradcheckable(Pickleable, Parentable): """ Adds the functionality for an object to be gradcheckable. @@ -644,6 +639,7 @@ class OptimizationHandlable(Constrainable): else: names = [adjust(x.name) for x in self._parameters_] if add_self: names = map(lambda x: adjust(self.name) + "." + x, names) return names + def _get_param_names(self): n = np.array([p.hierarchy_name() + '[' + str(i) + ']' for p in self.flattened_parameters for i in p._indices()]) return n @@ -710,12 +706,14 @@ class Parameterizable(OptimizationHandlable): super(Parameterizable, self).__init__(*args, **kwargs) from GPy.core.parameterization.lists_and_dicts import ArrayList self._parameters_ = ArrayList() + self._param_array_ = None self.size = 0 self._added_names_ = set() + self.__visited = False # for traversing in reverse order we need to know if we were here already @property def param_array(self): - if not hasattr(self, '_param_array_'): + if self._param_array_ is None: self._param_array_ = np.empty(self.size, dtype=np.float64) return self._param_array_ @@ -723,6 +721,42 @@ class Parameterizable(OptimizationHandlable): def param_array(self, arr): self._param_array_ = arr + def traverse(self, visit, *args, **kwargs): + """ + Traverse the hierarchy performing visit(self, *args, **kwargs) at every node passed by. + See "visitor pattern" in literature. This is implemented in pre-order fashion. + + Example: + Collect all children: + + children = [] + self.traverse(children.append) + print children + """ + if not self.__visited: + visit(self, *args, **kwargs) + self.__visited = True + for c in self._parameters_: + c.traverse(visit, *args, **kwargs) + + def traverse_parents(self, visit, *args, **kwargs): + """ + Traverse the hierarchy upwards, visiting all parents and their children. + See "visitor pattern" in literature. This is implemented in pre-order fashion. + + Example: + + parents = [] + self.traverse_parents(parents.append) + print parents + """ + if not self.__visited: + visit(self, *args, **kwargs) + self.__visited = True + if self.has_parent(): + self._parent_.traverse_parents(visit, *args, **kwargs) + self._parent_.traverse(visit, *args, **kwargs) + self.__visited = False #========================================================================= # Gradient handling #========================================================================= From 5826ac6b734e382c8d81447289297555562b1e83 Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Mon, 12 May 2014 12:04:44 +0100 Subject: [PATCH 21/43] [kern] pow for kernels now in place again --- GPy/kern/_src/kern.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index 368a9c87..8982c87f 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -164,8 +164,8 @@ class Kern(Parameterized): """ Shortcut for tensor `prod`. """ - assert self.active_dims == range(self.input_dim), "Can only use kernels, which have their input_dims defined from 0" - assert other.active_dims == range(other.input_dim), "Can only use kernels, which have their input_dims defined from 0" + assert np.all(self.active_dims == range(self.input_dim)), "Can only use kernels, which have their input_dims defined from 0" + assert np.all(other.active_dims == range(other.input_dim)), "Can only use kernels, which have their input_dims defined from 0" other.active_dims += self.input_dim return self.prod(other) From 22221565bbcdfcb4d4d7f6576fe789cce813a95a Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Mon, 12 May 2014 12:05:06 +0100 Subject: [PATCH 22/43] [visualize] some adjustments to vector_show --- GPy/examples/dimensionality_reduction.py | 6 +++--- GPy/plotting/matplot_dep/visualize.py | 16 ++++++++++------ 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index 8a31968e..ac1c50ee 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -161,6 +161,7 @@ def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, import GPy from matplotlib import pyplot as plt from ..util.misc import param_to_array + import numpy as np _np.random.seed(0) data = GPy.util.datasets.oil() @@ -174,11 +175,10 @@ def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, m.optimize('scg', messages=verbose, max_iters=max_iters, gtol=.05) if plot: - y = m.Y fig, (latent_axes, sense_axes) = plt.subplots(1, 2) m.plot_latent(ax=latent_axes, labels=m.data_labels) - data_show = GPy.plotting.matplot_dep.visualize.vector_show(y) - lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(param_to_array(m.X.mean), # @UnusedVariable + data_show = GPy.plotting.matplot_dep.visualize.vector_show(np.zeros((m.Y.shape[1], 1))) + lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(param_to_array(m.X.mean)[0:1,:], # @UnusedVariable m, data_show, latent_axes=latent_axes, sense_axes=sense_axes) raw_input('Press enter to finish') plt.close(fig) diff --git a/GPy/plotting/matplot_dep/visualize.py b/GPy/plotting/matplot_dep/visualize.py index fae05ff3..b26910c4 100644 --- a/GPy/plotting/matplot_dep/visualize.py +++ b/GPy/plotting/matplot_dep/visualize.py @@ -74,13 +74,17 @@ class vector_show(matplotlib_show): """ def __init__(self, vals, axes=None): matplotlib_show.__init__(self, vals, axes) - self.handle = self.axes.plot(np.arange(0, len(vals))[:, None], self.vals) + #assert vals.ndim == 2, "Please give a vector in [n x 1] to plot" + #assert vals.shape[1] == 1, "only showing a vector in one dimension" + self.size = vals.size + + self.handle = self.axes.plot(np.arange(0, vals.size)[:, None], self.vals)[0] def modify(self, vals): self.vals = vals.copy() - for handle, vals in zip(self.handle, self.vals.T): - xdata, ydata = handle.get_data() - handle.set_data(xdata, vals) + xdata, ydata = self.handle.get_data() + assert vals.size == self.size, "values passed into modify changed size! vals:{} != in:{}".format(vals.size, self.size) + self.handle.set_data(xdata, self.vals) self.axes.figure.canvas.draw() @@ -94,12 +98,12 @@ class lvm(matplotlib_show): :type data_visualize: visualize.data_show type. :param latent_axes: the axes where the latent visualization should be plotted. """ - if vals == None: + if vals is None: if isinstance(model.X, VariationalPosterior): vals = param_to_array(model.X.mean) else: vals = param_to_array(model.X) - + vals = param_to_array(vals) matplotlib_show.__init__(self, vals, axes=latent_axes) From ead4f6787a1c4cb25ee1d7079cd3f68ccc1f407b Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Mon, 12 May 2014 14:40:34 +0100 Subject: [PATCH 23/43] [testing] minor --- GPy/testing/parameterized_tests.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/GPy/testing/parameterized_tests.py b/GPy/testing/parameterized_tests.py index fbdedc61..bc989637 100644 --- a/GPy/testing/parameterized_tests.py +++ b/GPy/testing/parameterized_tests.py @@ -27,11 +27,11 @@ class ArrayCoreTest(unittest.TestCase): class ParameterizedTest(unittest.TestCase): def setUp(self): - self.rbf = GPy.kern.RBF(1) + self.rbf = GPy.kern.RBF(20) self.white = GPy.kern.White(1) from GPy.core.parameterization import Param from GPy.core.parameterization.transformations import Logistic - self.param = Param('param', np.random.rand(25,2), Logistic(0, 1)) + self.param = Param('param', np.random.uniform(0,1,(25,2)), Logistic(0, 1)) self.test1 = GPy.core.Parameterized("test model") self.test1.param = self.param @@ -142,6 +142,8 @@ class ParameterizedTest(unittest.TestCase): self.testmodel.randomize() self.assertEqual(val, self.testmodel.kern.lengthscale) + + def test_regular_expression_misc(self): self.testmodel.kern.lengthscale.fix() val = float(self.testmodel.kern.lengthscale) From e9260b248cf4e9881333c43af126915fa1a3c2c1 Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Mon, 12 May 2014 14:41:02 +0100 Subject: [PATCH 24/43] [visualize] vector show again --- GPy/examples/dimensionality_reduction.py | 4 ++-- GPy/plotting/matplot_dep/visualize.py | 5 ++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index ac1c50ee..43ba5937 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -177,7 +177,7 @@ def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, if plot: fig, (latent_axes, sense_axes) = plt.subplots(1, 2) m.plot_latent(ax=latent_axes, labels=m.data_labels) - data_show = GPy.plotting.matplot_dep.visualize.vector_show(np.zeros((m.Y.shape[1], 1))) + data_show = GPy.plotting.matplot_dep.visualize.vector_show((m.Y[0,:])) lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm_dimselect(param_to_array(m.X.mean)[0:1,:], # @UnusedVariable m, data_show, latent_axes=latent_axes, sense_axes=sense_axes) raw_input('Press enter to finish') @@ -186,7 +186,7 @@ def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40, def _simulate_sincos(D1, D2, D3, N, num_inducing, Q, plot_sim=False): _np.random.seed(1234) - + x = _np.linspace(0, 4 * _np.pi, N)[:, None] s1 = _np.vectorize(lambda x: _np.sin(x)) s2 = _np.vectorize(lambda x: _np.cos(x)**2) diff --git a/GPy/plotting/matplot_dep/visualize.py b/GPy/plotting/matplot_dep/visualize.py index b26910c4..fb443de1 100644 --- a/GPy/plotting/matplot_dep/visualize.py +++ b/GPy/plotting/matplot_dep/visualize.py @@ -77,13 +77,12 @@ class vector_show(matplotlib_show): #assert vals.ndim == 2, "Please give a vector in [n x 1] to plot" #assert vals.shape[1] == 1, "only showing a vector in one dimension" self.size = vals.size - - self.handle = self.axes.plot(np.arange(0, vals.size)[:, None], self.vals)[0] + self.handle = self.axes.plot(np.arange(0, vals.size)[:, None], vals)[0] def modify(self, vals): self.vals = vals.copy() xdata, ydata = self.handle.get_data() - assert vals.size == self.size, "values passed into modify changed size! vals:{} != in:{}".format(vals.size, self.size) + assert vals.size == self.size, "values passed into modify changed size! vals.size:{} != in.size:{}".format(vals.size, self.size) self.handle.set_data(xdata, self.vals) self.axes.figure.canvas.draw() From c8da9602ecd3e54188ecaced8aa0b939d1ea2cfb Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Mon, 12 May 2014 14:47:07 +0100 Subject: [PATCH 25/43] [param] indexing now returns exactly like numpy arrays --- GPy/core/parameterization/observable_array.py | 6 +++--- GPy/core/parameterization/param.py | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/GPy/core/parameterization/observable_array.py b/GPy/core/parameterization/observable_array.py index a280d74f..5b8aa1cd 100644 --- a/GPy/core/parameterization/observable_array.py +++ b/GPy/core/parameterization/observable_array.py @@ -1,7 +1,7 @@ # Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Licensed under the BSD 3-clause license (see LICENSE.txt) -__updated__ = '2014-04-15' +__updated__ = '2014-05-12' import numpy as np from parameter_core import Observable, Pickleable @@ -15,10 +15,10 @@ class ObsAr(np.ndarray, Pickleable, Observable): """ __array_priority__ = -1 # Never give back ObsAr def __new__(cls, input_array, *a, **kw): + # allways make a copy of input paramters, as we need it to be in C order: if not isinstance(input_array, ObsAr): - obj = np.atleast_1d(np.require(input_array, dtype=np.float64, requirements=['W', 'C'])).view(cls) + obj = np.atleast_1d(np.require(np.copy(input_array), dtype=np.float64, requirements=['W', 'C'])).view(cls) else: obj = input_array - #cls.__name__ = "ObsAr" # because of fixed printing of `array` in np printing super(ObsAr, obj).__init__(*a, **kw) return obj diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index 1c67b9d9..3ccbd169 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -45,7 +45,6 @@ class Param(OptimizationHandlable, ObsAr): _parameters_ = [] def __new__(cls, name, input_array, default_constraint=None): obj = numpy.atleast_1d(super(Param, cls).__new__(cls, input_array=input_array)) - cls.__name__ = "Param" obj._current_slice_ = (slice(obj.shape[0]),) obj._realshape_ = obj.shape obj._realsize_ = obj.size @@ -112,8 +111,8 @@ class Param(OptimizationHandlable, ObsAr): def __getitem__(self, s, *args, **kwargs): if not isinstance(s, tuple): s = (s,) - if not reduce(lambda a, b: a or numpy.any(b is Ellipsis), s, False) and len(s) <= self.ndim: - s += (Ellipsis,) + #if not reduce(lambda a, b: a or numpy.any(b is Ellipsis), s, False) and len(s) <= self.ndim: + # s += (Ellipsis,) new_arr = super(Param, self).__getitem__(s, *args, **kwargs) try: new_arr._current_slice_ = s; new_arr._original_ = self.base is new_arr.base except AttributeError: pass # returning 0d array or float, double etc From 0c2bae53f797c8a1e8c573a59e983d2b2a688fd6 Mon Sep 17 00:00:00 2001 From: marahman Date: Mon, 12 May 2014 18:45:23 +0100 Subject: [PATCH 26/43] Working with OU kernel --- GPy/kern/__init__.py | 2 +- GPy/kern/_src/stationary.py | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/GPy/kern/__init__.py b/GPy/kern/__init__.py index 1ed5e805..ab8fb523 100644 --- a/GPy/kern/__init__.py +++ b/GPy/kern/__init__.py @@ -3,7 +3,7 @@ from _src.rbf import RBF from _src.linear import Linear, LinearFull from _src.static import Bias, White from _src.brownian import Brownian -from _src.stationary import Exponential, Matern32, Matern52, ExpQuad, RatQuad, Cosine +from _src.stationary import Exponential, OU, Matern32, Matern52, ExpQuad, RatQuad, Cosine from _src.mlp import MLP from _src.periodic import PeriodicExponential, PeriodicMatern32, PeriodicMatern52 from _src.independent_outputs import IndependentOutputs, Hierarchical diff --git a/GPy/kern/_src/stationary.py b/GPy/kern/_src/stationary.py index a560f8ad..f561baa4 100644 --- a/GPy/kern/_src/stationary.py +++ b/GPy/kern/_src/stationary.py @@ -192,6 +192,27 @@ class Exponential(Stationary): def dK_dr(self, r): return -0.5*self.K_of_r(r) + +class OU(Stationary): + """ + OU kernel: + + .. math:: + + k(r) = \\sigma^2 \exp(- r) \\ \\ \\ \\ \\text{ where } r = \sqrt{\sum_{i=1}^input_dim \\frac{(x_i-y_i)^2}{\ell_i^2} } + + """ + + def __init__(self, input_dim, variance=1., lengthscale=None, ARD=False, active_dims=None, name='OU'): + super(OU, self).__init__(input_dim, variance, lengthscale, ARD, active_dims, name) + + def K_of_r(self, r): + return self.variance * np.exp(-r) + + def dK_dr(self,r): + return -1.*self.variance*np.exp(-r) + + class Matern32(Stationary): """ Matern 3/2 kernel: From 2256127130102521f2b4b48cdc6b017d4f8458fc Mon Sep 17 00:00:00 2001 From: Neil Lawrence Date: Tue, 13 May 2014 05:23:36 +0100 Subject: [PATCH 27/43] Made openmp switch in only dependent on potion in rbf.py and linear.py --- GPy/kern/_src/linear.py | 51 +++++++++++++------ GPy/kern/_src/rbf.py | 42 ++++++++++----- .../matplot_dep/dim_reduction_plots.py | 2 +- GPy/plotting/matplot_dep/models_plots.py | 4 +- GPy/util/data_resources.json | 27 ++++++++-- GPy/util/datasets.py | 38 ++++++++++---- GPy/util/misc.py | 6 +-- 7 files changed, 122 insertions(+), 48 deletions(-) diff --git a/GPy/kern/_src/linear.py b/GPy/kern/_src/linear.py index f9dacf02..3f696431 100644 --- a/GPy/kern/_src/linear.py +++ b/GPy/kern/_src/linear.py @@ -12,6 +12,7 @@ from ...core.parameterization.transformations import Logexp from ...util.caching import Cache_this from ...core.parameterization import variational from psi_comp import linear_psi_comp +from ...util.config import * class Linear(Kern): """ @@ -224,12 +225,23 @@ class Linear(Kern): AZZA = ZA.T[:, None, :, None] * ZA[None, :, None, :] AZZA = AZZA + AZZA.swapaxes(1, 2) AZZA_2 = AZZA/2. + if config.getboolean('parallel', 'openmp'): + pragma_string = '#pragma omp parallel for private(m,mm,q,qq,factor,tmp)' + header_string = '#include ' + weave_options = {'headers' : [''], + 'extra_compile_args': ['-fopenmp -O3'], + 'extra_link_args' : ['-lgomp'], + 'libraries': ['gomp']} + else: + pragma_string = '' + header_string = '' + weave_options = {'extra_compile_args': ['-O3']} #Using weave, we can exploit the symmetry of this problem: code = """ int n, m, mm,q,qq; double factor,tmp; - #pragma omp parallel for private(m,mm,q,qq,factor,tmp) + %s for(n=0;n + %s #include - """ - weave_options = {'headers' : [''], - 'extra_compile_args': ['-fopenmp -O3'], #-march=native'], - 'extra_link_args' : ['-lgomp']} + """ % header_string mu = vp.mean N,num_inducing,input_dim,mu = mu.shape[0],Z.shape[0],mu.shape[1],param_to_array(mu) - weave.inline(code, support_code=support_code, libraries=['gomp'], + weave.inline(code, support_code=support_code, arg_names=['N','num_inducing','input_dim','mu','AZZA','AZZA_2','target_mu','target_S','dL_dpsi2'], type_converters=weave.converters.blitz,**weave_options) def _weave_dpsi2_dZ(self, dL_dpsi2, Z, vp, target): AZA = self.variances*self._ZAinner(vp, Z) + + if config.getboolean('parallel', 'openmp'): + pragma_string = '#pragma omp parallel for private(n,mm,q)' + header_string = '#include ' + weave_options = {'headers' : [''], + 'extra_compile_args': ['-fopenmp -O3'], + 'extra_link_args' : ['-lgomp'], + 'libraries': ['gomp']} + else: + pragma_string = '' + header_string = '' + weave_options = {'extra_compile_args': ['-O3']} + code=""" int n,m,mm,q; - #pragma omp parallel for private(n,mm,q) + %s for(m=0;m + %s #include - """ - weave_options = {'headers' : [''], - 'extra_compile_args': ['-fopenmp -O3'], #-march=native'], - 'extra_link_args' : ['-lgomp']} + """ % header_string N,num_inducing,input_dim = vp.mean.shape[0],Z.shape[0],vp.mean.shape[1] mu = param_to_array(vp.mean) - weave.inline(code, support_code=support_code, libraries=['gomp'], + weave.inline(code, support_code=support_code, arg_names=['N','num_inducing','input_dim','AZA','target','dL_dpsi2'], type_converters=weave.converters.blitz,**weave_options) diff --git a/GPy/kern/_src/rbf.py b/GPy/kern/_src/rbf.py index e0071fb9..5bc80871 100644 --- a/GPy/kern/_src/rbf.py +++ b/GPy/kern/_src/rbf.py @@ -10,6 +10,7 @@ from GPy.util.caching import Cache_this from ...core.parameterization import variational from psi_comp import ssrbf_psi_comp from psi_comp.ssrbf_psi_gpucomp import PSICOMP_SSRBF +from ...util.config import * class RBF(Stationary): """ @@ -231,6 +232,16 @@ class RBF(Stationary): @Cache_this(limit=1) def _psi2computations(self, Z, vp): + + if config.getboolean('parallel', 'openmp'): + pragma_string = '#pragma omp parallel for private(tmp, exponent_tmp)' + header_string = '#include ' + libraries = ['gomp'] + else: + pragma_string = '' + header_string = '' + libraries = [] + mu, S = vp.mean, vp.variance N, Q = mu.shape @@ -253,8 +264,7 @@ class RBF(Stationary): variance_sq = float(np.square(self.variance)) code = """ double tmp, exponent_tmp; - - #pragma omp parallel for private(tmp, exponent_tmp) + %s for (int n=0; n + %s #include - """ + """ % header_string mu = param_to_array(mu) - weave.inline(code, support_code=support_code, libraries=['gomp'], + weave.inline(code, support_code=support_code, libraries=libraries, arg_names=['N', 'M', 'Q', 'mu', 'Zhat', 'mudist_sq', 'mudist', 'denom_l2', 'Zdist_sq', 'half_log_denom', 'psi2', 'variance_sq'], type_converters=weave.converters.blitz, **self.weave_options) @@ -303,12 +313,20 @@ class RBF(Stationary): #return 2.*np.einsum( 'ijk,ijk,ijkl,il->l', dL_dpsi2, psi2, Zdist_sq * (2.*S[:,None,None,:]/l2 + 1.) + mudist_sq + S[:, None, None, :] / l2, 1./(2.*S + l2))*self.lengthscale result = np.zeros(self.input_dim) + if config.getboolean('parallel', 'openmp'): + pragma_string = '#pragma omp parallel for reduction(+:tmp)' + header_string = '#include ' + libraries = ['gomp'] + else: + pragma_string = '' + header_string = '' + libraries = [] code = """ double tmp; for(int q=0; q + %s #include - """ + """ % header_string N,Q = S.shape M = psi2.shape[-1] S = param_to_array(S) - weave.inline(code, support_code=support_code, libraries=['gomp'], + weave.inline(code, support_code=support_code, libraries=libraries, arg_names=['psi2', 'dL_dpsi2', 'N', 'M', 'Q', 'mudist_sq', 'l2', 'Zdist_sq', 'S', 'result'], type_converters=weave.converters.blitz, **self.weave_options) diff --git a/GPy/plotting/matplot_dep/dim_reduction_plots.py b/GPy/plotting/matplot_dep/dim_reduction_plots.py index ca2c890f..71e08c6b 100644 --- a/GPy/plotting/matplot_dep/dim_reduction_plots.py +++ b/GPy/plotting/matplot_dep/dim_reduction_plots.py @@ -97,7 +97,7 @@ def plot_latent(model, labels=None, which_indices=None, elif type(ul) is np.int64: this_label = 'class %i' % ul else: - this_label = 'class %i' % i + this_label = unicode(i) m = marker.next() index = np.nonzero(labels == ul)[0] diff --git a/GPy/plotting/matplot_dep/models_plots.py b/GPy/plotting/matplot_dep/models_plots.py index 57b64ae5..84747d05 100644 --- a/GPy/plotting/matplot_dep/models_plots.py +++ b/GPy/plotting/matplot_dep/models_plots.py @@ -14,7 +14,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', which_data_ycols='all', fixed_inputs=[], levels=20, samples=0, fignum=None, ax=None, resolution=None, plot_raw=False, - linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue'], Y_metadata=None): + linecol=Tango.colorsHex['darkBlue'],fillcol=Tango.colorsHex['lightBlue'], Y_metadata=None, data_symbol='kx'): """ Plot the posterior of the GP. - In one dimension, the function is plotted with a shaded region identifying two standard deviations. @@ -97,7 +97,7 @@ def plot_fit(model, plot_limits=None, which_data_rows='all', for d in which_data_ycols: plots['gpplot'] = gpplot(Xnew, m[:, d], lower[:, d], upper[:, d], ax=ax, edgecol=linecol, fillcol=fillcol) - plots['dataplot'] = ax.plot(X[which_data_rows,free_dims], Y[which_data_rows, d], 'kx', mew=1.5) + plots['dataplot'] = ax.plot(X[which_data_rows,free_dims], Y[which_data_rows, d], data_symbol, mew=1.5) #optionally plot some samples if samples: #NOTE not tested with fixed_inputs diff --git a/GPy/util/data_resources.json b/GPy/util/data_resources.json index 51070650..6cc692e8 100644 --- a/GPy/util/data_resources.json +++ b/GPy/util/data_resources.json @@ -150,6 +150,26 @@ ] }, "fruitfly_tomancak": { + "citation": "", + "details": "", + "files": [ + [ + "tomancak_exprs.csv", + "tomancak_se.csv", + "tomancak_prctile5.csv", + "tomancak_prctile25.csv", + "tomancak_prctile50.csv", + "tomancak_prctile75.csv", + "tomancak_prctile95.csv" + ] + ], + "license": null, + "size": 59000000, + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/fruitfly_tomancak/" + ] + }, + "fruitfly_tomancak_cel_files": { "citation": "'Systematic determination of patterns of gene expression during Drosophila embryogenesis' Pavel Tomancak, Amy Beaton, Richard Weiszmann, Elaine Kwan, ShengQiang Shu, Suzanna E Lewis, Stephen Richards, Michael Ashburner, Volker Hartenstein, Susan E Celniker, and Gerald M Rubin", "details": "Gene expression results from blastoderm development in Drosophila Melanogaster.", "files": [ @@ -198,7 +218,7 @@ ] ], "license": null, - "size": 1, + "size": 389000000, "urls": [ "ftp://ftp.fruitfly.org/pub/embryo_tc_array_data/" ] @@ -217,6 +237,7 @@ "http://www.google.com/trends/" ] }, + "hapmap3": { "citation": "Gibbs, Richard A., et al. 'The international HapMap project.' Nature 426.6968 (2003): 789-796.", "details": "HapMap Project: Single Nucleotide Polymorphism sequenced in all human populations. \n The HapMap phase three SNP dataset - 1184 samples out of 11 populations.\n See http://www.nature.com/nature/journal/v426/n6968/abs/nature02168.html for details.\n\n SNP_matrix (A) encoding [see Paschou et all. 2007 (PCA-Correlated SNPs...)]:\n Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then\n\n / 1, iff SNPij==(B1,B1)\n Aij = | 0, iff SNPij==(B1,B2)\n \\\\ -1, iff SNPij==(B2,B2)\n\n The SNP data and the meta information (such as iid, sex and phenotype) are\n stored in the dataframe datadf, index is the Individual ID, \n with following columns for metainfo:\n\n * family_id -> Family ID\n * paternal_id -> Paternal ID\n * maternal_id -> Maternal ID\n * sex -> Sex (1=male; 2=female; other=unknown)\n * phenotype -> Phenotype (-9, or 0 for unknown)\n * population -> Population string (e.g. 'ASW' - 'YRI')\n * rest are SNP rs (ids)\n\n More information is given in infodf:\n\n * Chromosome:\n - autosomal chromosemes -> 1-22\n - X X chromosome -> 23\n - Y Y chromosome -> 24\n - XY Pseudo-autosomal region of X -> 25\n - MT Mitochondrial -> 26\n * Relative Positon (to Chromosome) [base pairs]\n\n ", @@ -434,7 +455,7 @@ }, "singlecell": { "citation": "Guoji Guo, Mikael Huss, Guo Qing Tong, Chaoyang Wang, Li Li Sun, Neil D. Clarke, Paul Robson, Resolution of Cell Fate Decisions Revealed by Single-Cell Gene Expression Analysis from Zygote to Blastocyst, Developmental Cell, Volume 18, Issue 4, 20 April 2010, Pages 675-685, ISSN 1534-5807, http://dx.doi.org/10.1016/j.devcel.2010.02.012. (http://www.sciencedirect.com/science/article/pii/S1534580710001103) Keywords: DEVBIO", - "details": "qPCR Singlecell experiment in Mouse, measuring 48 gene expressions in 1-64 cell states. The labels have been created as in Guo et al. [2010]", + "details": "qPCR TaqMan array single cell experiment in mouse. The data is taken from the early stages of development when the Blastocyst is forming. At the 32 cell stage the data is already separated into the trophectoderm (TE) which goes onto form the placenta and the inner cellular mass (ICM). The ICM further differentiates into the epiblast (EPI)---which gives rise to the endoderm, mesoderm and ectoderm---and the primitive endoderm (PE) which develops into the amniotic sack. Guo et al selected 48 genes for expression measurement. They labelled the resulting cells and their labels are included as an aide to visualization.", "files": [ [ "singlecell.csv" @@ -443,7 +464,7 @@ "license": "ScienceDirect: http://www.elsevier.com/locate/termsandconditions?utm_source=sciencedirect&utm_medium=link&utm_campaign=terms", "size": 233.1, "urls": [ - "http://staffwww.dcs.sheffield.ac.uk/people/M.Zwiessele/data/singlecell/" + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/singlecell/" ] }, "swiss_roll": { diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py index bdd55066..c18431ef 100644 --- a/GPy/util/datasets.py +++ b/GPy/util/datasets.py @@ -112,7 +112,7 @@ def download_url(url, store_directory, save_name = None, messages = True, suffix if content_length_str: file_size = int(content_length_str[0]) else: - file_size = 1e10 + file_size = None status = "" file_size_dl = 0 block_sz = 8192 @@ -124,9 +124,15 @@ def download_url(url, store_directory, save_name = None, messages = True, suffix file_size_dl += len(buff) f.write(buff) sys.stdout.write(" "*(len(status)) + "\r") - status = r"[{perc: <{ll}}] {dl:7.3f}/{full:.3f}MB".format(dl=file_size_dl/(1.*1e6), - full=file_size/(1.*1e6), ll=line_length, + if file_size: + status = r"[{perc: <{ll}}] {dl:7.3f}/{full:.3f}MB".format(dl=file_size_dl/(1048576.), + full=file_size/(1048576.), ll=line_length, perc="="*int(line_length*float(file_size_dl)/file_size)) + else: + status = r"[{perc: <{ll}}] {dl:7.3f}MB".format(dl=file_size_dl/(1048576.), + ll=line_length, + perc="."*int(line_length*float(file_size_dl/(10*1048576.)))) + sys.stdout.write(status) sys.stdout.flush() sys.stdout.write(" "*(len(status)) + "\r") @@ -357,8 +363,15 @@ def football_data(season='1314', data_set='football_data'): def fruitfly_tomancak(data_set='fruitfly_tomancak', gene_number=None): if not data_available(data_set): download_data(data_set) - X = None - Y = None + from pandas import read_csv + filename = os.path.join(data_path, 'tomancak_expr.csv') + Y = read_csv(filename, header=0, index_col=0).T + num_repeats = 3 + num_time = 12 + xt = np.linspace(0, num_time-1, num_time) + xr = np.linspace(0, num_repeats-1, num_repeats) + xtime, xrepeat = np.meshgrid(xt, xr) + X = np.vstack((xtime.flatten(), xrepeat.flatten())).T return data_details_return({'X': X, 'Y': Y, 'gene_number' : gene_number}, data_set) # This will be for downloading google trends data. @@ -732,13 +745,16 @@ def hapmap3(data_set='hapmap3'): def singlecell(data_set='singlecell'): if not data_available(data_set): download_data(data_set) + + from pandas import read_csv dirpath = os.path.join(data_path, data_set) - data = np.loadtxt(os.path.join(dirpath, 'singlecell.csv'), delimiter=",", dtype=str) - genes = data[0, 1:] - labels = data[1:, 0] - Y = np.array(data[1:, 1:], dtype=float) - return data_details_return({'Y': Y, 'info' : "qPCR Singlecell experiment in Mouse, measuring 48 gene expressions in 1-64 cell states. The labels have been created as in Guo et al. [2010]", - 'genes':genes, 'labels':labels, + filename = os.path.join(dirpath, 'singlecell.csv') + Y = read_csv(filename, header=0, index_col=0) + genes = Y.columns + labels = Y.index + # data = np.loadtxt(os.path.join(dirpath, 'singlecell.csv'), delimiter=",", dtype=str) + return data_details_return({'Y': Y, 'info' : "qPCR singlecell experiment in Mouse, measuring 48 gene expressions in 1-64 cell states. The labels have been created as in Guo et al. [2010]", + 'genes': genes, 'labels':labels, }, data_set) def swiss_roll_1000(): diff --git a/GPy/util/misc.py b/GPy/util/misc.py index dc327324..fa9bb24c 100644 --- a/GPy/util/misc.py +++ b/GPy/util/misc.py @@ -130,14 +130,14 @@ def fast_array_equal(A, B): """ % pragma_string if config.getboolean('parallel', 'openmp'): - pragma_string = '#include ' + header_string = '#include ' else: - pragma_string = '' + header_string = '' support_code = """ %s #include - """ % pragma_string + """ % header_string weave_options_openmp = {'headers' : [''], From 0acb196b2637b9b76600b9236eb96b74948812c6 Mon Sep 17 00:00:00 2001 From: mzwiessele Date: Tue, 13 May 2014 08:36:28 +0100 Subject: [PATCH 28/43] [examples] stick man example corrected --- GPy/examples/dimensionality_reduction.py | 27 +++++++++++++++--------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index 43ba5937..71610702 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -408,13 +408,13 @@ def stick(kernel=None, optimize=True, verbose=True, plot=True): data = GPy.util.datasets.osu_run1() # optimize m = GPy.models.GPLVM(data['Y'], 2, kernel=kernel) - if optimize: m.optimize(messages=verbose, max_f_eval=10000) + if optimize: m.optimize('bfgs', messages=verbose, max_f_eval=10000) if plot: plt.clf ax = m.plot_latent() y = m.Y[0, :] data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect']) - vis = GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, latent_axes=ax) + vis = GPy.plotting.matplot_dep.visualize.lvm(m.X[:1, :].copy(), m, data_show, latent_axes=ax) raw_input('Press enter to finish') return m @@ -475,23 +475,30 @@ def robot_wireless(optimize=True, verbose=True, plot=True): def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True): from GPy.models import BayesianGPLVM from matplotlib import pyplot as plt + import numpy as np import GPy data = GPy.util.datasets.osu_run1() Q = 6 - kernel = GPy.kern.RBF(Q, ARD=True) + GPy.kern.Bias(Q, _np.exp(-2)) + GPy.kern.White(Q, _np.exp(-2)) + kernel = GPy.kern.RBF(Q, lengthscale=np.repeat(.5, Q), ARD=True) + GPy.kern.Bias(Q, _np.exp(-2)) m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20, kernel=kernel) + + m.data = data + + m.X.mean -= m.X.mean.mean(0); m.X.mean /= m.X.mean.var(0) + m.X.variance /= 100 + m.likelihood.variance = 0.001 + m.Z.randomize() + # optimize - m.ensure_default_constraints() - if optimize: m.optimize('scg', messages=verbose, max_iters=200, xtol=1e-300, ftol=1e-300) - m._set_params(m._get_params()) + if optimize: m.optimize('bfgs', messages=verbose, max_iters=1500, xtol=1e-300, ftol=1e-300) if plot: plt.clf, (latent_axes, sense_axes) = plt.subplots(1, 2) plt.sca(latent_axes) - m.plot_latent() - y = m.likelihood.Y[0, :].copy() - data_show = GPy.plotting.matplot_dep.visualize.stick_show(y[None, :], connect=data['connect']) - GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X[0, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes) + m.plot_latent(ax=latent_axes) + y = m.Y[:1, :].copy() + data_show = GPy.plotting.matplot_dep.visualize.stick_show(y, connect=data['connect']) + GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean[:1, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes) raw_input('Press enter to finish') return m From 4590a05d0fc938d02ef99c3c32487f50d7455f45 Mon Sep 17 00:00:00 2001 From: mzwiessele Date: Tue, 13 May 2014 08:36:11 +0100 Subject: [PATCH 29/43] [visualize] minor --- GPy/plotting/matplot_dep/visualize.py | 1 - 1 file changed, 1 deletion(-) diff --git a/GPy/plotting/matplot_dep/visualize.py b/GPy/plotting/matplot_dep/visualize.py index fb443de1..6abd3872 100644 --- a/GPy/plotting/matplot_dep/visualize.py +++ b/GPy/plotting/matplot_dep/visualize.py @@ -103,7 +103,6 @@ class lvm(matplotlib_show): else: vals = param_to_array(model.X) - vals = param_to_array(vals) matplotlib_show.__init__(self, vals, axes=latent_axes) if isinstance(latent_axes,mpl.axes.Axes): From 4f627c904fa5d008f308421ae5636336d0042d5f Mon Sep 17 00:00:00 2001 From: mzwiessele Date: Tue, 13 May 2014 08:35:25 +0100 Subject: [PATCH 30/43] [variational] posterior object copies adjusted --- GPy/core/parameterization/param.py | 16 +++++++++++++--- GPy/core/parameterization/variational.py | 3 +++ 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index 3ccbd169..19e48d84 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -89,6 +89,13 @@ class Param(OptimizationHandlable, ObsAr): def param_array(self): return self + @property + def values(self): + """ + Return self as numpy array view + """ + return self.view(np.ndarray) + @property def gradient(self): """ @@ -99,11 +106,11 @@ class Param(OptimizationHandlable, ObsAr): """ if getattr(self, '_gradient_array_', None) is None: self._gradient_array_ = numpy.empty(self._realshape_, dtype=numpy.float64) - return self._gradient_array_[self._current_slice_] + return self._gradient_array_#[self._current_slice_] @gradient.setter def gradient(self, val): - self._gradient_array_[self._current_slice_] = val + self._gradient_array_[:] = val #=========================================================================== # Array operations -> done @@ -114,7 +121,10 @@ class Param(OptimizationHandlable, ObsAr): #if not reduce(lambda a, b: a or numpy.any(b is Ellipsis), s, False) and len(s) <= self.ndim: # s += (Ellipsis,) new_arr = super(Param, self).__getitem__(s, *args, **kwargs) - try: new_arr._current_slice_ = s; new_arr._original_ = self.base is new_arr.base + try: + new_arr._current_slice_ = s + new_arr._gradient_array_ = self.gradient[s] + new_arr._original_ = self.base is new_arr.base except AttributeError: pass # returning 0d array or float, double etc return new_arr diff --git a/GPy/core/parameterization/variational.py b/GPy/core/parameterization/variational.py index 3730baed..044d1592 100644 --- a/GPy/core/parameterization/variational.py +++ b/GPy/core/parameterization/variational.py @@ -100,6 +100,9 @@ class VariationalPosterior(Parameterized): n.__dict__.update(dc) n._parameters_[dc['mean']._parent_index_] = dc['mean'] n._parameters_[dc['variance']._parent_index_] = dc['variance'] + n._gradient_array_ = None + oversize = self.size - self.mean.size - self.variance.size + n.size = n.mean.size + n.variance.size + oversize n.ndim = n.mean.ndim n.shape = n.mean.shape n.num_data = n.mean.shape[0] From 4b1577178c3fe23dfe2a31c2a30223b39fb87921 Mon Sep 17 00:00:00 2001 From: mzwiessele Date: Tue, 13 May 2014 08:33:26 +0100 Subject: [PATCH 31/43] [init] now returns normalized values --- GPy/util/initialization.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/GPy/util/initialization.py b/GPy/util/initialization.py index 22e63b6b..8d23b541 100644 --- a/GPy/util/initialization.py +++ b/GPy/util/initialization.py @@ -13,7 +13,11 @@ def initialize_latent(init, input_dim, Y): p = pca(Y) PC = p.project(Y, min(input_dim, Y.shape[1])) Xr[:PC.shape[0], :PC.shape[1]] = PC + vars = p.fracs[:input_dim] else: - var = Xr.var(0) - return Xr, var/var.max() - return Xr, p.fracs[:input_dim] \ No newline at end of file + vars = Xr.var(0) + + Xr -= Xr.mean(0) + Xr /= Xr.var(0) + + return Xr, vars/vars.max() \ No newline at end of file From 53ff580a6ecf19437585ec86ab9b18e95ffb3663 Mon Sep 17 00:00:00 2001 From: mzwiessele Date: Tue, 13 May 2014 09:03:57 +0100 Subject: [PATCH 32/43] [stick] bgplvm example now working --- GPy/examples/dimensionality_reduction.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py index 71610702..a15c2a93 100644 --- a/GPy/examples/dimensionality_reduction.py +++ b/GPy/examples/dimensionality_reduction.py @@ -480,18 +480,14 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True): data = GPy.util.datasets.osu_run1() Q = 6 - kernel = GPy.kern.RBF(Q, lengthscale=np.repeat(.5, Q), ARD=True) + GPy.kern.Bias(Q, _np.exp(-2)) + kernel = GPy.kern.RBF(Q, lengthscale=np.repeat(.5, Q), ARD=True) m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20, kernel=kernel) m.data = data - - m.X.mean -= m.X.mean.mean(0); m.X.mean /= m.X.mean.var(0) - m.X.variance /= 100 m.likelihood.variance = 0.001 - m.Z.randomize() # optimize - if optimize: m.optimize('bfgs', messages=verbose, max_iters=1500, xtol=1e-300, ftol=1e-300) + if optimize: m.optimize('bfgs', messages=verbose, max_iters=800, xtol=1e-300, ftol=1e-300) if plot: plt.clf, (latent_axes, sense_axes) = plt.subplots(1, 2) plt.sca(latent_axes) @@ -499,7 +495,8 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True): y = m.Y[:1, :].copy() data_show = GPy.plotting.matplot_dep.visualize.stick_show(y, connect=data['connect']) GPy.plotting.matplot_dep.visualize.lvm_dimselect(m.X.mean[:1, :].copy(), m, data_show, latent_axes=latent_axes, sense_axes=sense_axes) - raw_input('Press enter to finish') + plt.draw() + #raw_input('Press enter to finish') return m @@ -516,7 +513,7 @@ def cmu_mocap(subject='35', motion=['01'], in_place=True, optimize=True, verbose if optimize: m.optimize(messages=verbose, max_f_eval=10000) if plot: ax = m.plot_latent() - y = m.likelihood.Y[0, :] + y = m.Y[0, :] data_show = GPy.plotting.matplot_dep.visualize.skeleton_show(y[None, :], data['skel']) lvm_visualizer = GPy.plotting.matplot_dep.visualize.lvm(m.X[0, :].copy(), m, data_show, ax) raw_input('Press enter to finish') From 8ff4a42d1a392906390edde8a6f7169a595cb7d1 Mon Sep 17 00:00:00 2001 From: mzwiessele Date: Tue, 13 May 2014 09:32:58 +0100 Subject: [PATCH 33/43] [param_array] doc --- GPy/core/parameterization/parameter_core.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index 93924678..5113b8d9 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -713,6 +713,10 @@ class Parameterizable(OptimizationHandlable): @property def param_array(self): + """ + Array representing the parameters of this class. + There is only one copy of all parameters in memory, two during optimization. + """ if self._param_array_ is None: self._param_array_ = np.empty(self.size, dtype=np.float64) return self._param_array_ From f110bbd4c8e96908a77b5d3c93635051a83373dd Mon Sep 17 00:00:00 2001 From: mzwiessele Date: Tue, 13 May 2014 09:33:35 +0100 Subject: [PATCH 34/43] [bgplvm] init lengthscale as 0./var --- GPy/models/bayesian_gplvm.py | 2 +- GPy/util/initialization.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/GPy/models/bayesian_gplvm.py b/GPy/models/bayesian_gplvm.py index 03cd361c..2bcbe0b2 100644 --- a/GPy/models/bayesian_gplvm.py +++ b/GPy/models/bayesian_gplvm.py @@ -42,7 +42,7 @@ class BayesianGPLVM(SparseGP): assert Z.shape[1] == X.shape[1] if kernel is None: - kernel = kern.RBF(input_dim, lengthscale=fracs, ARD=True) # + kern.white(input_dim) + kernel = kern.RBF(input_dim, lengthscale=1./fracs, ARD=True) # + kern.white(input_dim) if likelihood is None: likelihood = Gaussian() diff --git a/GPy/util/initialization.py b/GPy/util/initialization.py index 8d23b541..dd3b6ec7 100644 --- a/GPy/util/initialization.py +++ b/GPy/util/initialization.py @@ -13,11 +13,11 @@ def initialize_latent(init, input_dim, Y): p = pca(Y) PC = p.project(Y, min(input_dim, Y.shape[1])) Xr[:PC.shape[0], :PC.shape[1]] = PC - vars = p.fracs[:input_dim] + var = p.fracs[:input_dim] else: - vars = Xr.var(0) + var = Xr.var(0) Xr -= Xr.mean(0) Xr /= Xr.var(0) - return Xr, vars/vars.max() \ No newline at end of file + return Xr, var/var.max() \ No newline at end of file From db644408ea74858a8f23bd79a10d48b1b46dc39d Mon Sep 17 00:00:00 2001 From: Neil Lawrence Date: Tue, 13 May 2014 12:17:42 +0100 Subject: [PATCH 35/43] Add ordinal and attempt to fix downloads --- GPy/gpy_config.cfg | 4 ++ GPy/likelihoods/ordinal.py | 48 +++++++++++++++++++ .../matplot_dep/dim_reduction_plots.py | 2 +- GPy/util/data_resources.json | 15 ++++++ GPy/util/datasets.py | 21 +++++++- 5 files changed, 87 insertions(+), 3 deletions(-) create mode 100644 GPy/likelihoods/ordinal.py diff --git a/GPy/gpy_config.cfg b/GPy/gpy_config.cfg index db90dbf6..43cd0ebe 100644 --- a/GPy/gpy_config.cfg +++ b/GPy/gpy_config.cfg @@ -6,6 +6,10 @@ # some platforms, hence this option. openmp=False +[datasets] +# location for the local data cache +dir=$HOME/tmp/GPy-datasets/ + [anaconda] # if you have an anaconda python installation please specify it here. installed = False diff --git a/GPy/likelihoods/ordinal.py b/GPy/likelihoods/ordinal.py new file mode 100644 index 00000000..4ac204fd --- /dev/null +++ b/GPy/likelihoods/ordinal.py @@ -0,0 +1,48 @@ +# Copyright (c) 2014 The GPy authors (see AUTHORS.txt) +# Licensed under the BSD 3-clause license (see LICENSE.txt) + + +import sympy as sym +from GPy.util.symbolic import gammaln, normcdfln, normcdf, IndMatrix, create_matrix +import numpy as np +from ..util.univariate_Gaussian import std_norm_pdf, std_norm_cdf +import link_functions +from symbolic import Symbolic +from scipy import stats + +class Ordinal(Symbolic): + """ + Ordinal + + .. math:: + p(y_{i}|\pi(f_{i})) = \left(\frac{r}{r+f_i}\right)^r \frac{\Gamma(r+y_i)}{y!\Gamma(r)}\left(\frac{f_i}{r+f_i}\right)^{y_i} + + .. Note:: + Y takes non zero integer values.. + link function should have a positive domain, e.g. log (default). + + .. See also:: + symbolic.py, for the parent class + """ + def __init__(self, categories=3, gp_link=None): + if gp_link is None: + gp_link = link_functions.Identity() + + dispersion = sym.Symbol('width', positive=True, real=True) + y_0 = sym.Symbol('y_0', nonnegative=True, integer=True) + f_0 = sym.Symbol('f_0', positive=True, real=True) + log_pdf = create_matrix('log_pdf', 1, categories) + log_pdf[0] = normcdfln(-f_0) + if categories>2: + w = create_matrix('w', 1, categories) + log_pdf[categories-1] = normcdfln(w.sum() + f_0) + for i in range(1, categories-1): + log_pdf[i] = sym.log(normcdf(w[0, 0:i-1].sum() + f_0) - normcdf(w[0, 0:i].sum()-f_0) ) + else: + log_pdf[1] = normcdfln(f_0) + log_pdf.index_var = y_0 + super(Ordinal, self).__init__(log_pdf=log_pdf, gp_link=gp_link, name='Ordinal') + + # TODO: Check this. + self.log_concave = True + diff --git a/GPy/plotting/matplot_dep/dim_reduction_plots.py b/GPy/plotting/matplot_dep/dim_reduction_plots.py index 71e08c6b..f8413671 100644 --- a/GPy/plotting/matplot_dep/dim_reduction_plots.py +++ b/GPy/plotting/matplot_dep/dim_reduction_plots.py @@ -97,7 +97,7 @@ def plot_latent(model, labels=None, which_indices=None, elif type(ul) is np.int64: this_label = 'class %i' % ul else: - this_label = unicode(i) + this_label = unicode(ul) m = marker.next() index = np.nonzero(labels == ul)[0] diff --git a/GPy/util/data_resources.json b/GPy/util/data_resources.json index 6cc692e8..d6640295 100644 --- a/GPy/util/data_resources.json +++ b/GPy/util/data_resources.json @@ -467,6 +467,21 @@ "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/singlecell/" ] }, + "sod1_mouse": { + "citation": "Transcriptomic indices of fast and slow disease progression in two mouse models of amyotrophic lateral sclerosis' Nardo G1, Iennaco R, Fusi N, Heath PR, Marino M, Trolese MC, Ferraiuolo L, Lawrence N, Shaw PJ, Bendotti C Brain. 2013 Nov;136(Pt 11):3305-32. doi: 10.1093/brain/awt250. Epub 2013 Sep 24.", + "details": "Gene expression data from two separate strains of mice: C57 and 129Sv in wild type and SOD1 mutant strains.", + "files": [ + [ + "sod1_C59_129_exprs.csv", + "sod1_C59_129_se.csv" + ] + ], + "license": null, + "size": 0, + "urls": [ + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/singlecell/sod1_mouse/" + ] + }, "swiss_roll": { "citation": "A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", "details": "Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py index c18431ef..05e4013e 100644 --- a/GPy/util/datasets.py +++ b/GPy/util/datasets.py @@ -12,6 +12,8 @@ import datetime import json import re +from config import * + ipython_available=True try: import IPython @@ -29,7 +31,8 @@ def reporthook(a,b,c): sys.stdout.flush() # Global variables -data_path = os.path.join(os.path.dirname(__file__), 'datasets') +data_path = os.path.expandvar(config.get('datasets', 'dir')) +#data_path = os.path.join(os.path.dirname(__file__), 'datasets') default_seed = 10000 overide_manual_authorize=False neil_url = 'http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/' @@ -360,11 +363,25 @@ def football_data(season='1314', data_set='football_data'): Y = table[:, 4:] return data_details_return({'X': X, 'Y': Y}, data_set) +def sod1_mouse(data_set='sod1_mouse'): + if not data_available(data_set): + download_data(data_set) + from pandas import read_csv + dirpath = os.path.join(data_path, data_set) + filename = os.path.join(dirpath, 'sod1_C57_129_exprs.csv') + Y = read_csv(filename, header=0, index_col=0).T + num_repeats=4 + num_time=4 + num_cond=4 + X = 1 + return data_details_return({'X': X, 'Y': Y}, data_set) + def fruitfly_tomancak(data_set='fruitfly_tomancak', gene_number=None): if not data_available(data_set): download_data(data_set) from pandas import read_csv - filename = os.path.join(data_path, 'tomancak_expr.csv') + dirpath = os.path.join(data_path, data_set) + filename = os.path.join(dirpath, 'tomancak_expr.csv') Y = read_csv(filename, header=0, index_col=0).T num_repeats = 3 num_time = 12 From 851e63476ce7ca186c97bb1e6a68ed420f35f715 Mon Sep 17 00:00:00 2001 From: mzwiessele Date: Tue, 13 May 2014 12:53:42 +0100 Subject: [PATCH 36/43] [pydot] build pydot with new observer list --- GPy/core/parameterization/lists_and_dicts.py | 1 + GPy/core/parameterization/param.py | 4 ++-- GPy/core/parameterization/parameterized.py | 8 ++++---- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/GPy/core/parameterization/lists_and_dicts.py b/GPy/core/parameterization/lists_and_dicts.py index 13547c94..64bdb077 100644 --- a/GPy/core/parameterization/lists_and_dicts.py +++ b/GPy/core/parameterization/lists_and_dicts.py @@ -57,6 +57,7 @@ class ObservablesList(object): def __repr__(self): return self._poc.__repr__() + def add(self, priority, observable, callble): if observable is not None: diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index 19e48d84..91bf3561 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -57,9 +57,9 @@ class Param(OptimizationHandlable, ObsAr): def build_pydot(self,G): import pydot - node = pydot.Node(id(self), shape='record', label=self.name) + node = pydot.Node(id(self), shape='trapezium', label=self.name)#, fontcolor='white', color='white') G.add_node(node) - for o in self.observers.keys(): + for _, o, _ in self.observers: label = o.name if hasattr(o, 'name') else str(o) observed_node = pydot.Node(id(o), label=label) G.add_node(observed_node) diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py index 738f0485..67694a1b 100644 --- a/GPy/core/parameterization/parameterized.py +++ b/GPy/core/parameterization/parameterized.py @@ -82,15 +82,15 @@ class Parameterized(Parameterizable): import pydot # @UnresolvedImport iamroot = False if G is None: - G = pydot.Dot(graph_type='digraph') + G = pydot.Dot(graph_type='digraph', bgcolor=None) iamroot=True - node = pydot.Node(id(self), shape='record', label=self.name) + node = pydot.Node(id(self), shape='box', label=self.name)#, color='white') G.add_node(node) for child in self._parameters_: child_node = child.build_pydot(G) - G.add_edge(pydot.Edge(node, child_node)) + G.add_edge(pydot.Edge(node, child_node))#, color='white')) - for o in self.observers.keys(): + for _, o, _ in self.observers: label = o.name if hasattr(o, 'name') else str(o) observed_node = pydot.Node(id(o), label=label) G.add_node(observed_node) From 442bc3f58199678e26e03b37a865ea1bb3975880 Mon Sep 17 00:00:00 2001 From: mzwiessele Date: Tue, 13 May 2014 14:20:59 +0100 Subject: [PATCH 37/43] [paramcore] fix for traversal --- GPy/core/parameterization/parameter_core.py | 22 ++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index 5113b8d9..ba85be03 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -599,14 +599,13 @@ class OptimizationHandlable(Constrainable): return p def _set_params_transformed(self, p): - if p is self.param_array: - p = p.copy() - if self.has_parent() and self.constraints[__fixed__].size != 0: - fixes = np.ones(self.size).astype(bool) - fixes[self.constraints[__fixed__]] = FIXED - self.param_array.flat[fixes] = p - elif self._has_fixes(): self.param_array.flat[self._fixes_] = p - else: self.param_array.flat = p + if not(p is self.param_array): + if self.has_parent() and self.constraints[__fixed__].size != 0: + fixes = np.ones(self.size).astype(bool) + fixes[self.constraints[__fixed__]] = FIXED + self.param_array.flat[fixes] = p + elif self._has_fixes(): self.param_array.flat[self._fixes_] = p + else: self.param_array.flat = p self.untransform() self._trigger_params_changed() @@ -621,7 +620,7 @@ class OptimizationHandlable(Constrainable): def num_params(self): """ Return the number of parameters of this parameter_handle. - Param objects will allways return 0. + Param objects will always return 0. """ raise NotImplemented, "Abstract, please implement in respective classes" @@ -742,14 +741,15 @@ class Parameterizable(OptimizationHandlable): self.__visited = True for c in self._parameters_: c.traverse(visit, *args, **kwargs) + self.__visited = False def traverse_parents(self, visit, *args, **kwargs): """ Traverse the hierarchy upwards, visiting all parents and their children. See "visitor pattern" in literature. This is implemented in pre-order fashion. - + Example: - + parents = [] self.traverse_parents(parents.append) print parents From 2953e6b73b02c98db81c0ea0d7521bb564c9029e Mon Sep 17 00:00:00 2001 From: Neil Lawrence Date: Tue, 13 May 2014 17:00:02 +0100 Subject: [PATCH 38/43] Add ordinal and attempt to fix downloads --- GPy/util/datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py index 05e4013e..9f8e1938 100644 --- a/GPy/util/datasets.py +++ b/GPy/util/datasets.py @@ -31,7 +31,7 @@ def reporthook(a,b,c): sys.stdout.flush() # Global variables -data_path = os.path.expandvar(config.get('datasets', 'dir')) +data_path = os.path.expandvars(config.get('datasets', 'dir')) #data_path = os.path.join(os.path.dirname(__file__), 'datasets') default_seed = 10000 overide_manual_authorize=False From afc74b02cec98a8045e3a9ba864598923c2046c7 Mon Sep 17 00:00:00 2001 From: Neil Lawrence Date: Tue, 13 May 2014 17:02:24 +0100 Subject: [PATCH 39/43] Sod1 Download --- GPy/util/data_resources.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/GPy/util/data_resources.json b/GPy/util/data_resources.json index d6640295..58c2157c 100644 --- a/GPy/util/data_resources.json +++ b/GPy/util/data_resources.json @@ -479,7 +479,7 @@ "license": null, "size": 0, "urls": [ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/singlecell/sod1_mouse/" + "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/sod1_mouse/" ] }, "swiss_roll": { From 62920c2811cbce58f9935cf9e900cb07d472d7c8 Mon Sep 17 00:00:00 2001 From: Neil Lawrence Date: Tue, 13 May 2014 17:08:51 +0100 Subject: [PATCH 40/43] Made openmp switch in only dependent on potion in rbf.py and linear.py --- GPy/util/data_resources.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPy/util/data_resources.json b/GPy/util/data_resources.json index 58c2157c..61050f9d 100644 --- a/GPy/util/data_resources.json +++ b/GPy/util/data_resources.json @@ -472,8 +472,8 @@ "details": "Gene expression data from two separate strains of mice: C57 and 129Sv in wild type and SOD1 mutant strains.", "files": [ [ - "sod1_C59_129_exprs.csv", - "sod1_C59_129_se.csv" + "sod1_C57_129_exprs.csv", + "sod1_C57_129_se.csv" ] ], "license": null, From cff37293d9e0d1fdd6a655ff5e64e84425fb0d28 Mon Sep 17 00:00:00 2001 From: Neil Lawrence Date: Tue, 13 May 2014 17:20:57 +0100 Subject: [PATCH 41/43] Fixing fruitfly_tomancak data load. --- GPy/util/datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py index 9f8e1938..81a6fabd 100644 --- a/GPy/util/datasets.py +++ b/GPy/util/datasets.py @@ -369,7 +369,7 @@ def sod1_mouse(data_set='sod1_mouse'): from pandas import read_csv dirpath = os.path.join(data_path, data_set) filename = os.path.join(dirpath, 'sod1_C57_129_exprs.csv') - Y = read_csv(filename, header=0, index_col=0).T + Y = read_csv(filename, header=0, index_col=0) num_repeats=4 num_time=4 num_cond=4 @@ -381,7 +381,7 @@ def fruitfly_tomancak(data_set='fruitfly_tomancak', gene_number=None): download_data(data_set) from pandas import read_csv dirpath = os.path.join(data_path, data_set) - filename = os.path.join(dirpath, 'tomancak_expr.csv') + filename = os.path.join(dirpath, 'tomancak_exprs.csv') Y = read_csv(filename, header=0, index_col=0).T num_repeats = 3 num_time = 12 From 8d6eed60108fdfc668ea8b9d47a6545392767e27 Mon Sep 17 00:00:00 2001 From: mzwiessele Date: Wed, 14 May 2014 08:53:56 +0100 Subject: [PATCH 42/43] [param] hierarchy traversal easier now --- GPy/core/parameterization/lists_and_dicts.py | 8 ++- GPy/core/parameterization/param.py | 23 +++++++- GPy/core/parameterization/parameter_core.py | 56 ++++++++++---------- GPy/testing/pickle_tests.py | 3 ++ 4 files changed, 57 insertions(+), 33 deletions(-) diff --git a/GPy/core/parameterization/lists_and_dicts.py b/GPy/core/parameterization/lists_and_dicts.py index 64bdb077..084ab0db 100644 --- a/GPy/core/parameterization/lists_and_dicts.py +++ b/GPy/core/parameterization/lists_and_dicts.py @@ -88,19 +88,17 @@ class ObservablesList(object): def __iter__(self): self.flush() for p, o, c in self._poc: - if o() is not None: - yield p, o(), c + yield p, o(), c def __len__(self): self.flush() return self._poc.__len__() def __deepcopy__(self, memo): - self.flush() s = ObservablesList() - for p,o,c in self._poc: + for p,o,c in self: import copy - s.add(p, copy.deepcopy(o(), memo), copy.deepcopy(c, memo)) + s.add(p, copy.deepcopy(o, memo), copy.deepcopy(c, memo)) s.flush() return s diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index 91bf3561..920072d7 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -169,8 +169,29 @@ class Param(OptimizationHandlable, ObsAr): # parameterizable #=========================================================================== def traverse(self, visit, *args, **kwargs): - visit(self, *args, **kwargs) + """ + Traverse the hierarchy performing visit(self, *args, **kwargs) at every node passed by. + See "visitor pattern" in literature. This is implemented in pre-order fashion. + This will function will just call visit on self, as Param are leaf nodes. + """ + visit(self, *args, **kwargs) + + def traverse_parents(self, visit, *args, **kwargs): + """ + Traverse the hierarchy upwards, visiting all parents and their children, except self. + See "visitor pattern" in literature. This is implemented in pre-order fashion. + + Example: + + parents = [] + self.traverse_parents(parents.append) + print parents + """ + if self.has_parent(): + self.__visited = True + self._parent_._traverse_parents(visit, *args, **kwargs) + self.__visited = False #=========================================================================== # Convenience diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index ba85be03..0a0ad067 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -176,24 +176,23 @@ class Pickleable(object): #raise NotImplementedError, "Copy is not yet implemented, TODO: Observable hierarchy" import copy memo = {} + # the next part makes sure that we do not include parents in any form: parents = [] - self.traverse_parents(parents.append) - # remove self, which is the first arguments - parents = [p for p in parents if p is not self] + self.traverse_parents(parents.append) # collect parents for p in parents: - memo[id(p)] = None - memo[id(self.gradient)] = None - memo[id(self.param_array)] = None - memo[id(self._fixes_)] = None - c = copy.deepcopy(self, memo) + memo[id(p)] = None # set all parents to be None, so they will not be copied + memo[id(self.gradient)] = None # reset the gradient + memo[id(self.param_array)] = None # and param_array + memo[id(self._fixes_)] = None # fixes have to be reset, as this is now highest parent + c = copy.deepcopy(self, memo) # and start the copy c._parent_index_ = None return c def __deepcopy__(self, memo): - s = self.__new__(self.__class__) - memo[id(self)] = s + s = self.__new__(self.__class__) # fresh instance + memo[id(self)] = s # be sure to break all cycles --> self is already done import copy - s.__dict__.update(copy.deepcopy(self.__dict__, memo)) + s.__dict__.update(copy.deepcopy(self.__dict__, memo)) # standard copy return s def __getstate__(self): @@ -580,12 +579,6 @@ class OptimizationHandlable(Constrainable): def __init__(self, name, default_constraint=None, *a, **kw): super(OptimizationHandlable, self).__init__(name, default_constraint=default_constraint, *a, **kw) - def transform(self): - [np.put(self.param_array, ind, c.finv(self.param_array.flat[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__] - - def untransform(self): - [np.put(self.param_array, ind, c.f(self.param_array.flat[ind])) for c, ind in self.constraints.iteritems() if c != __fixed__] - def _get_params_transformed(self): # transformed parameters (apply transformation rules) p = self.param_array.copy() @@ -606,7 +599,8 @@ class OptimizationHandlable(Constrainable): self.param_array.flat[fixes] = p elif self._has_fixes(): self.param_array.flat[self._fixes_] = p else: self.param_array.flat = p - self.untransform() + [np.put(self.param_array, ind, c.f(self.param_array.flat[ind])) + for c, ind in self.constraints.iteritems() if c != __fixed__] self._trigger_params_changed() def _trigger_params_changed(self, trigger_parent=True): @@ -726,7 +720,9 @@ class Parameterizable(OptimizationHandlable): def traverse(self, visit, *args, **kwargs): """ - Traverse the hierarchy performing visit(self, *args, **kwargs) at every node passed by. + Traverse the hierarchy performing visit(self, *args, **kwargs) + at every node passed by downwards. This function includes self! + See "visitor pattern" in literature. This is implemented in pre-order fashion. Example: @@ -745,7 +741,7 @@ class Parameterizable(OptimizationHandlable): def traverse_parents(self, visit, *args, **kwargs): """ - Traverse the hierarchy upwards, visiting all parents and their children. + Traverse the hierarchy upwards, visiting all parents and their children except self. See "visitor pattern" in literature. This is implemented in pre-order fashion. Example: @@ -754,13 +750,20 @@ class Parameterizable(OptimizationHandlable): self.traverse_parents(parents.append) print parents """ - if not self.__visited: - visit(self, *args, **kwargs) + if self.has_parent(): self.__visited = True + self._parent_._traverse_parents(visit, *args, **kwargs) + self.__visited = False + + def _traverse_parents(self, visit, *args, **kwargs): + if not self.__visited: + self.__visited = True + visit(self, *args, **kwargs) if self.has_parent(): - self._parent_.traverse_parents(visit, *args, **kwargs) + self._parent_._traverse_parents(visit, *args, **kwargs) self._parent_.traverse(visit, *args, **kwargs) self.__visited = False + #========================================================================= # Gradient handling #========================================================================= @@ -827,11 +830,10 @@ class Parameterizable(OptimizationHandlable): # raise HierarchyError, "parameter {} already in another model ({}), create new object (or copy) for adding".format(param._short(), param._highest_parent_._short()) elif param not in self._parameters_: if param.has_parent(): - parent = param._parent_ - while parent is not None: + def visit(parent, self): if parent is self: raise HierarchyError, "You cannot add a parameter twice into the hierarchy" - parent = parent._parent_ + param.traverse_parents(visit, self) param._parent_.remove_parameter(param) # make sure the size is set if index is None: @@ -875,7 +877,7 @@ class Parameterizable(OptimizationHandlable): :param param: param object to remove from being a parameter of this parameterized object. """ if not param in self._parameters_: - raise RuntimeError, "Parameter {} does not belong to this object, remove parameters directly from their respective parents".format(param._short()) + raise RuntimeError, "Parameter {} does not belong to this object {}, remove parameters directly from their respective parents".format(param._short(), self.name) start = sum([p.size for p in self._parameters_[:param._parent_index_]]) self._remove_parameter_name(param) diff --git a/GPy/testing/pickle_tests.py b/GPy/testing/pickle_tests.py index 37dd6e0b..b62f5e45 100644 --- a/GPy/testing/pickle_tests.py +++ b/GPy/testing/pickle_tests.py @@ -132,6 +132,9 @@ class Test(ListDictTestCase): self.assertIsNot(par.full_gradient, pcopy.full_gradient) self.assertTrue(pcopy.checkgrad()) self.assert_(np.any(pcopy.gradient!=0.0)) + pcopy.optimize('bfgs') + par.optimize('bfgs') + np.testing.assert_allclose(pcopy.param_array, par.param_array, atol=.001) with tempfile.TemporaryFile('w+b') as f: par.pickle(f) f.seek(0) From ec70fef7809d27dd4658e247e664695f5336f2e1 Mon Sep 17 00:00:00 2001 From: James Hensman Date: Wed, 14 May 2014 10:04:58 +0100 Subject: [PATCH 43/43] minor edit in scg, raise notimplemented dL_dX in hierarchical --- GPy/inference/optimization/scg.py | 2 +- GPy/kern/_src/independent_outputs.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/GPy/inference/optimization/scg.py b/GPy/inference/optimization/scg.py index c99fa7d1..503c19be 100644 --- a/GPy/inference/optimization/scg.py +++ b/GPy/inference/optimization/scg.py @@ -32,7 +32,7 @@ def print_out(len_maxiters, fnow, current_grad, beta, iteration): sys.stdout.flush() def exponents(fnow, current_grad): - exps = [np.abs(fnow), current_grad] + exps = [np.abs(np.float(fnow)), current_grad] return np.sign(exps) * np.log10(exps).astype(int) def SCG(f, gradf, x, optargs=(), maxiters=500, max_f_eval=np.inf, display=True, xtol=None, ftol=None, gtol=None): diff --git a/GPy/kern/_src/independent_outputs.py b/GPy/kern/_src/independent_outputs.py index 12c51ca3..64314197 100644 --- a/GPy/kern/_src/independent_outputs.py +++ b/GPy/kern/_src/independent_outputs.py @@ -180,6 +180,9 @@ class Hierarchical(CombinationKernel): def Kdiag(self,X): return np.diag(self.K(X)) + def gradients_X(self, dL_dK, X, X2=None): + raise NotImplementedError + def update_gradients_full(self,dL_dK,X,X2=None): slices = [index_to_slices(X[:,i]) for i in self.extra_dims] if X2 is None: