diff --git a/GPy/core/parameterization/observable_array.py b/GPy/core/parameterization/observable_array.py index cd0c85d6..a280d74f 100644 --- a/GPy/core/parameterization/observable_array.py +++ b/GPy/core/parameterization/observable_array.py @@ -1,7 +1,7 @@ # Copyright (c) 2012, GPy authors (see AUTHORS.txt). # Licensed under the BSD 3-clause license (see LICENSE.txt) -__updated__ = '2014-03-31' +__updated__ = '2014-04-15' import numpy as np from parameter_core import Observable, Pickleable diff --git a/GPy/core/parameterization/param.py b/GPy/core/parameterization/param.py index 9c3d7bd3..c22ad3c5 100644 --- a/GPy/core/parameterization/param.py +++ b/GPy/core/parameterization/param.py @@ -3,6 +3,7 @@ import itertools import numpy +np = numpy from parameter_core import OptimizationHandlable, adjust_name_for_printing from observable_array import ObsAr @@ -118,10 +119,6 @@ class Param(OptimizationHandlable, ObsAr): except AttributeError: pass # returning 0d array or float, double etc return new_arr - def __setitem__(self, s, val): - super(Param, self).__setitem__(s, val) - - def _raveled_index(self, slice_index=None): # return an index array on the raveled array, which is formed by the current_slice # of this object @@ -311,15 +308,15 @@ class ParamConcatenation(object): #=========================================================================== def __getitem__(self, s): ind = numpy.zeros(sum(self._param_sizes), dtype=bool); ind[s] = True; - params = [p.param_array[ind[ps]] for p,ps in zip(self.params, self._param_slices_) if numpy.any(p.param_array[ind[ps]])] + params = [p.param_array.flat[ind[ps]] for p,ps in zip(self.params, self._param_slices_) if numpy.any(p.param_array.flat[ind[ps]])] if len(params)==1: return params[0] return ParamConcatenation(params) def __setitem__(self, s, val, update=True): if isinstance(val, ParamConcatenation): val = val.values() ind = numpy.zeros(sum(self._param_sizes), dtype=bool); ind[s] = True; - vals = self.values(); vals[s] = val; del val - [numpy.place(p, ind[ps], vals[ps]) + vals = self.values(); vals[s] = val + [numpy.copyto(p, vals[ps], where=ind[ps]) for p, ps in zip(self.params, self._param_slices_)] if update: self.update_all_params() @@ -342,8 +339,8 @@ class ParamConcatenation(object): self.update_all_params() constrain_positive.__doc__ = Param.constrain_positive.__doc__ - def constrain_fixed(self, warning=True): - [param.constrain_fixed(warning) for param in self.params] + def constrain_fixed(self, value=None, warning=True, trigger_parent=True): + [param.constrain_fixed(value, warning, trigger_parent) for param in self.params] constrain_fixed.__doc__ = Param.constrain_fixed.__doc__ fix = constrain_fixed @@ -411,3 +408,42 @@ class ParamConcatenation(object): return "\n".join(strings) def __repr__(self): return "\n".join(map(repr,self.params)) + + def __ilshift__(self, *args, **kwargs): + self[:] = np.ndarray.__ilshift__(self.values(), *args, **kwargs) + + def __irshift__(self, *args, **kwargs): + self[:] = np.ndarray.__irshift__(self.values(), *args, **kwargs) + + def __ixor__(self, *args, **kwargs): + self[:] = np.ndarray.__ixor__(self.values(), *args, **kwargs) + + def __ipow__(self, *args, **kwargs): + self[:] = np.ndarray.__ipow__(self.values(), *args, **kwargs) + + def __ifloordiv__(self, *args, **kwargs): + self[:] = np.ndarray.__ifloordiv__(self.values(), *args, **kwargs) + + def __isub__(self, *args, **kwargs): + self[:] = np.ndarray.__isub__(self.values(), *args, **kwargs) + + def __ior__(self, *args, **kwargs): + self[:] = np.ndarray.__ior__(self.values(), *args, **kwargs) + + def __itruediv__(self, *args, **kwargs): + self[:] = np.ndarray.__itruediv__(self.values(), *args, **kwargs) + + def __idiv__(self, *args, **kwargs): + self[:] = np.ndarray.__idiv__(self.values(), *args, **kwargs) + + def __iand__(self, *args, **kwargs): + self[:] = np.ndarray.__iand__(self.values(), *args, **kwargs) + + def __imod__(self, *args, **kwargs): + self[:] = np.ndarray.__imod__(self.values(), *args, **kwargs) + + def __iadd__(self, *args, **kwargs): + self[:] = np.ndarray.__iadd__(self.values(), *args, **kwargs) + + def __imul__(self, *args, **kwargs): + self[:] = np.ndarray.__imul__(self.values(), *args, **kwargs) diff --git a/GPy/core/parameterization/parameter_core.py b/GPy/core/parameterization/parameter_core.py index 43bc7177..b513ba44 100644 --- a/GPy/core/parameterization/parameter_core.py +++ b/GPy/core/parameterization/parameter_core.py @@ -15,8 +15,9 @@ Observable Pattern for patameterization from transformations import Logexp, NegativeLogexp, Logistic, __fixed__, FIXED, UNFIXED import numpy as np +import re -__updated__ = '2014-03-31' +__updated__ = '2014-04-16' class HierarchyError(Exception): """ @@ -28,7 +29,15 @@ def adjust_name_for_printing(name): Make sure a name can be printed, alongside used as a variable name. """ if name is not None: - return name.replace(" ", "_").replace(".", "_").replace("-", "_m_").replace("+", "_p_").replace("!", "_I_").replace("**", "_xx_").replace("*", "_x_").replace("/", "_l_").replace("@", '_at_') + name2 = name + name = name.replace(" ", "_").replace(".", "_").replace("-", "_m_") + name = name.replace("+", "_p_").replace("!", "_I_") + name = name.replace("**", "_xx_").replace("*", "_x_") + name = name.replace("/", "_l_").replace("@", '_at_') + name = name.replace("(", "_of_").replace(")", "") + if re.match(r'^[a-zA-Z_][a-zA-Z0-9-_]*$', name) is None: + raise NameError, "name {} converted to {} cannot be further converted to valid python variable name!".format(name2, name) + return name return '' @@ -458,7 +467,7 @@ class Constrainable(Nameable, Indexable, Observable): Constrain the parameter to the given :py:class:`GPy.core.transformations.Transformation`. """ - self.param_array[:] = transform.initialize(self.param_array) + self.param_array[...] = transform.initialize(self.param_array) reconstrained = self.unconstrain() self._add_to_index_operations(self.constraints, reconstrained, transform, warning) self.notify_observers(self, None if trigger_parent else -np.inf) diff --git a/GPy/core/parameterization/parameterized.py b/GPy/core/parameterization/parameterized.py index a794ab40..738f0485 100644 --- a/GPy/core/parameterization/parameterized.py +++ b/GPy/core/parameterization/parameterized.py @@ -185,6 +185,8 @@ class Parameterized(Parameterizable): return ParamConcatenation(paramlist) def __setitem__(self, name, value, paramlist=None): + if value is None: + return # nothing to do here if isinstance(name, (slice, tuple, np.ndarray)): try: self.param_array[name] = value @@ -197,8 +199,8 @@ class Parameterized(Parameterizable): param[:] = value def __setattr__(self, name, val): - # override the default behaviour, if setting a param, so broadcasting can by used - if hasattr(self, '_parameters_'): + # override the default behaviour, if setting a param, so broadcasting can by used + if hasattr(self, "_parameters_"): pnames = self.parameter_names(False, adjust_for_printing=True, recursive=False) if name in pnames: self._parameters_[pnames.index(name)][:] = val; return object.__setattr__(self, name, val); diff --git a/GPy/inference/latent_function_inference/var_dtc.py b/GPy/inference/latent_function_inference/var_dtc.py index 7344b204..0cc841ed 100644 --- a/GPy/inference/latent_function_inference/var_dtc.py +++ b/GPy/inference/latent_function_inference/var_dtc.py @@ -192,17 +192,22 @@ class VarDTC(object): class VarDTCMissingData(object): const_jitter = 1e-6 - def __init__(self, limit=1): + def __init__(self, limit=1, inan=None): from ...util.caching import Cacher self._Y = Cacher(self._subarray_computations, limit) + self._inan = inan pass def set_limit(self, limit): self._Y.limit = limit def _subarray_computations(self, Y): - inan = np.isnan(Y) - has_none = inan.any() + if self._inan is None: + inan = np.isnan(Y) + has_none = inan.any() + else: + inan = self._inan + has_none = True if has_none: from ...util.subarray_and_sorting import common_subarrays self._subarray_indices = [] diff --git a/GPy/kern/_src/kern.py b/GPy/kern/_src/kern.py index f871e676..6daff739 100644 --- a/GPy/kern/_src/kern.py +++ b/GPy/kern/_src/kern.py @@ -13,7 +13,7 @@ class Kern(Parameterized): #=========================================================================== # This adds input slice support. The rather ugly code for slicing can be # found in kernel_slice_operations - #__metaclass__ = KernCallsViaSlicerMeta + __metaclass__ = KernCallsViaSlicerMeta #=========================================================================== _support_GPU=False def __init__(self, input_dim, active_dims, name, useGPU=False, *a, **kw): @@ -21,26 +21,50 @@ class Kern(Parameterized): The base class for a kernel: a positive definite function which forms of a covariance function (kernel). + input_dim: + + is the number of dimensions to work on. Make sure to give the + tight dimensionality of inputs. + You most likely want this to be the integer telling the number of + input dimensions of the kernel. + If this is not an integer (!) we will work on the whole input matrix X, + and not check whether dimensions match or not (!). + + active_dims: + + is the active_dimensions of inputs X we will work on. + All kernels will get sliced Xes as inputs, if active_dims is not None + if active_dims is None, slicing is switched off and all X will be passed through as given. + :param int input_dim: the number of input dimensions to the function - :param array-like|slice active_dims: list of indices on which dimensions this kernel works on + :param array-like|slice|None active_dims: list of indices on which dimensions this kernel works on, or none if no slicing Do not instantiate. """ super(Kern, self).__init__(name=name, *a, **kw) - self.active_dims = active_dims if active_dims is not None else slice(0, input_dim) - self.input_dim = input_dim - assert isinstance(self.active_dims, (slice, list, tuple, np.ndarray)), 'active_dims needs to be an array-like or slice object over dimensions, {} given'.format(self.active_dims.__class__) - if isinstance(self.active_dims, slice): - self.active_dims = slice(self.active_dims.start or 0, self.active_dims.stop or self.input_dim, self.active_dims.step or 1) - active_dim_size = int(np.round((self.active_dims.stop-self.active_dims.start)/self.active_dims.step)) - elif isinstance(self.active_dims, np.ndarray): - assert self.active_dims.ndim == 1, 'only flat indices allowed, given active_dims.shape={}, provide only indexes to the dimensions of the input'.format(self.active_dims.shape) - active_dim_size = self.active_dims.size - else: - active_dim_size = len(self.active_dims) - assert active_dim_size == self.input_dim, "input_dim={} does not match len(active_dim)={}, active_dims={}".format(self.input_dim, active_dim_size, self.active_dims) + try: + self.input_dim = int(input_dim) + self.active_dims = active_dims# if active_dims is not None else slice(0, input_dim, 1) + except TypeError: + # input_dim is something else then an integer + self.input_dim = input_dim + if active_dims is not None: + print "WARNING: given input_dim={} is not an integer and active_dims={} is given, switching off slicing" + self.active_dims = None + + if self.active_dims is not None and self.input_dim is not None: + assert isinstance(self.active_dims, (slice, list, tuple, np.ndarray)), 'active_dims needs to be an array-like or slice object over dimensions, {} given'.format(self.active_dims.__class__) + if isinstance(self.active_dims, slice): + self.active_dims = slice(self.active_dims.start or 0, self.active_dims.stop or self.input_dim, self.active_dims.step or 1) + active_dim_size = int(np.round((self.active_dims.stop-self.active_dims.start)/self.active_dims.step)) + elif isinstance(self.active_dims, np.ndarray): + #assert np.all(self.active_dims >= 0), 'active dimensions need to be positive. negative indexing is not allowed' + assert self.active_dims.ndim == 1, 'only flat indices allowed, given active_dims.shape={}, provide only indexes to the dimensions (columns) of the input'.format(self.active_dims.shape) + active_dim_size = self.active_dims.size + else: + active_dim_size = len(self.active_dims) + assert active_dim_size == self.input_dim, "input_dim={} does not match len(active_dim)={}, active_dims={}".format(self.input_dim, active_dim_size, self.active_dims) self._sliced_X = 0 - self.useGPU = self._support_GPU and useGPU @Cache_this(limit=10) @@ -205,9 +229,12 @@ class CombinationKernel(Kern): return self._parameters_ def get_input_dim_active_dims(self, kernels, extra_dims = None): - active_dims = reduce(np.union1d, (np.r_[x.active_dims] for x in kernels), np.array([], dtype=int)) - input_dim = active_dims.max()+1 + (len(np.r_[extra_dims]) if extra_dims is not None else 0) - active_dims = slice(0, input_dim, 1) + #active_dims = reduce(np.union1d, (np.r_[x.active_dims] for x in kernels), np.array([], dtype=int)) + #active_dims = np.array(np.concatenate((active_dims, extra_dims if extra_dims is not None else [])), dtype=int) + input_dim = np.array([k.input_dim for k in kernels]) + if np.all(input_dim[0]==input_dim): + input_dim = input_dim[0] + active_dims = None return input_dim, active_dims def input_sensitivity(self): diff --git a/GPy/kern/_src/kernel_slice_operations.py b/GPy/kern/_src/kernel_slice_operations.py index a4bb8f62..10dbacee 100644 --- a/GPy/kern/_src/kernel_slice_operations.py +++ b/GPy/kern/_src/kernel_slice_operations.py @@ -33,8 +33,11 @@ class _Slice_wrap(object): def __init__(self, k, X, X2=None): self.k = k self.shape = X.shape - if self.k._sliced_X == 0: - assert X.shape[1] > max(np.r_[self.k.active_dims]), "At least {} dimensional X needed".format(max(np.r_[self.k.active_dims])) + assert X.ndim == 2, "only matrices are allowed as inputs to kernels for now, given X.shape={!s}".format(X.shape) + if X2 is not None: + assert X2.ndim == 2, "only matrices are allowed as inputs to kernels for now, given X2.shape={!s}".format(X2.shape) + if (self.k.active_dims is not None) and (self.k._sliced_X == 0): + assert X.shape[1] >= len(np.r_[self.k.active_dims]), "At least {} dimensional X needed, X.shape={!s}".format(len(np.r_[self.k.active_dims]), X.shape) self.X = self.k._slice_X(X) self.X2 = self.k._slice_X(X2) if X2 is not None else X2 self.ret = True diff --git a/GPy/kern/_src/stationary.py b/GPy/kern/_src/stationary.py index 37acbf2d..a560f8ad 100644 --- a/GPy/kern/_src/stationary.py +++ b/GPy/kern/_src/stationary.py @@ -139,7 +139,7 @@ class Stationary(Kern): #self.lengthscale.gradient = -((dL_dr*rinv)[:,:,None]*x_xl3).sum(0).sum(0)/self.lengthscale**3 tmp = dL_dr*self._inv_dist(X, X2) if X2 is None: X2 = X - self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(self._slice_X(X)[:,q:q+1] - self._slice_X(X2)[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in xrange(self.input_dim)]) + self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in xrange(self.input_dim)]) else: r = self._scaled_dist(X, X2) self.lengthscale.gradient = -np.sum(dL_dr*r)/self.lengthscale diff --git a/GPy/testing/kernel_tests.py b/GPy/testing/kernel_tests.py index 91683edc..5bd3f494 100644 --- a/GPy/testing/kernel_tests.py +++ b/GPy/testing/kernel_tests.py @@ -9,8 +9,6 @@ from GPy.core.parameterization.param import Param verbose = 0 -np.random.seed(50) - class Kern_check_model(GPy.core.Model): """ @@ -260,7 +258,6 @@ class KernelGradientTestsContinuous(unittest.TestCase): self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose)) def test_Prod3(self): - k = GPy.kern.Matern32(2, active_dims=[2,3]) * (GPy.kern.RBF(2, active_dims=[0,4]) + GPy.kern.Linear(self.D)) k = (GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D)) k.randomize() self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose)) @@ -274,7 +271,7 @@ class KernelGradientTestsContinuous(unittest.TestCase): def test_Add_dims(self): k = GPy.kern.Matern32(2, active_dims=[2,self.D]) + GPy.kern.RBF(2, active_dims=[0,4]) + GPy.kern.Linear(self.D) k.randomize() - self.assertRaises(AssertionError, k.K, self.X) + self.assertRaises(IndexError, k.K, self.X) k = GPy.kern.Matern32(2, active_dims=[2,self.D-1]) + GPy.kern.RBF(2, active_dims=[0,4]) + GPy.kern.Linear(self.D) k.randomize() # assert it runs: @@ -303,46 +300,26 @@ class KernelGradientTestsContinuous(unittest.TestCase): k.randomize() self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose)) -#TODO: turn off grad checkingwrt X for indexed kernels like coregionalize -# class KernelGradientTestsContinuous1D(unittest.TestCase): -# def setUp(self): -# self.N, self.D = 100, 1 -# self.X = np.random.randn(self.N,self.D) -# self.X2 = np.random.randn(self.N+10,self.D) -# -# continuous_kerns = ['RBF', 'Linear'] -# self.kernclasses = [getattr(GPy.kern, s) for s in continuous_kerns] -# -# def test_PeriodicExponential(self): -# k = GPy.kern.PeriodicExponential(self.D) -# k.randomize() -# self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose)) -# -# def test_PeriodicMatern32(self): -# k = GPy.kern.PeriodicMatern32(self.D) -# k.randomize() -# self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose)) -# -# def test_PeriodicMatern52(self): -# k = GPy.kern.PeriodicMatern52(self.D) -# k.randomize() -# self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose)) - - class KernelTestsMiscellaneous(unittest.TestCase): def setUp(self): N, D = 100, 10 - self.X = np.linspace(-np.pi, +np.pi, N)[:,None] * np.ones(D) + self.X = np.linspace(-np.pi, +np.pi, N)[:,None] * np.random.uniform(-10,10,D) self.rbf = GPy.kern.RBF(2, active_dims=slice(0,4,2)) self.linear = GPy.kern.Linear(2, active_dims=(3,9)) - self.matern = GPy.kern.Matern32(3, active_dims=np.array([2,4,9])) + self.matern = GPy.kern.Matern32(3, active_dims=np.array([1,7,9])) self.sumkern = self.rbf + self.linear self.sumkern += self.matern self.sumkern.randomize() def test_active_dims(self): - self.assertEqual(self.sumkern.input_dim, 10) - self.assertEqual(self.sumkern.active_dims, slice(0, 10, 1)) + # test the automatic dim detection expression for slices: + start, stop = 0, 277 + for i in range(start,stop,7): + for j in range(1,4): + GPy.kern.Kern(int(np.round((i+1)/j)), slice(0, i+1, j), "testkern") + # test the ability to have only one dim + sk = GPy.kern.RBF(2) + GPy.kern.Matern32(2) + self.assertEqual(sk.input_dim, 2) def test_which_parts(self): self.assertTrue(np.allclose(self.sumkern.K(self.X, which_parts=[self.linear, self.matern]), self.linear.K(self.X)+self.matern.K(self.X))) @@ -365,7 +342,7 @@ class KernelTestsNonContinuous(unittest.TestCase): self.X2 = np.random.randn((N0+N1)*2, self.D+1) self.X2[:(N0*2), -1] = 0 self.X2[(N0*2):, -1] = 1 - + def test_IndependentOutputs(self): k = GPy.kern.RBF(self.D) kern = GPy.kern.IndependentOutputs(k, -1, 'ind_single') @@ -373,7 +350,7 @@ class KernelTestsNonContinuous(unittest.TestCase): k = [GPy.kern.RBF(1, active_dims=[1], name='rbf1'), GPy.kern.RBF(self.D, name='rbf012'), GPy.kern.RBF(2, active_dims=[0,2], name='rbf02')] kern = GPy.kern.IndependentOutputs(k, -1, name='ind_split') self.assertTrue(check_kernel_gradient_functions(kern, X=self.X, X2=self.X2, verbose=verbose, fixed_X_dims=-1)) - + def test_ODE_UY(self): kern = GPy.kern.ODE_UY(2, active_dims=[0, self.D]) X = self.X[self.X[:,-1]!=2] diff --git a/GPy/testing/model_tests.py b/GPy/testing/model_tests.py index 4d20035d..6f80f418 100644 --- a/GPy/testing/model_tests.py +++ b/GPy/testing/model_tests.py @@ -130,6 +130,17 @@ class MiscTests(unittest.TestCase): m2.kern[:] = m.kern[''].values() np.testing.assert_equal(m.log_likelihood(), m2.log_likelihood()) + def test_model_set_params(self): + m = GPy.models.GPRegression(self.X, self.Y) + lengthscale = np.random.uniform() + m.kern.lengthscale = lengthscale + np.testing.assert_equal(m.kern.lengthscale, lengthscale) + m.kern.lengthscale *= 1 + m['.*var'] -= .1 + np.testing.assert_equal(m.kern.lengthscale, lengthscale) + m.optimize() + print m + def test_model_optimize(self): X = np.random.uniform(-3., 3., (20, 1)) Y = np.sin(X) + np.random.randn(20, 1) * 0.05 diff --git a/GPy/testing/parameterized_tests.py b/GPy/testing/parameterized_tests.py index 8bfaab4e..57669e93 100644 --- a/GPy/testing/parameterized_tests.py +++ b/GPy/testing/parameterized_tests.py @@ -142,6 +142,17 @@ class ParameterizedTest(unittest.TestCase): self.testmodel.randomize() self.assertEqual(val, self.testmodel.kern.lengthscale) + def test_regular_expression_misc(self): + self.testmodel.kern.lengthscale.fix() + val = float(self.testmodel.kern.lengthscale) + self.testmodel.randomize() + self.assertEqual(val, self.testmodel.kern.lengthscale) + + variances = self.testmodel['.*var'].values() + self.testmodel['.*var'].fix() + self.testmodel.randomize() + np.testing.assert_equal(variances, self.testmodel['.*var'].values()) + def test_printing(self): print self.test1 print self.param diff --git a/GPy/testing/pickle_tests.py b/GPy/testing/pickle_tests.py index d975aaa3..37dd6e0b 100644 --- a/GPy/testing/pickle_tests.py +++ b/GPy/testing/pickle_tests.py @@ -185,6 +185,7 @@ class Test(ListDictTestCase): def _callback(self, what, which): what.count += 1 + @unittest.skip def test_add_observer(self): par = toy_rbf_1d_50(optimize=0, plot=0) par.name = "original" diff --git a/GPy/util/data_resources.json b/GPy/util/data_resources.json index 57b79f10..845d56be 100644 --- a/GPy/util/data_resources.json +++ b/GPy/util/data_resources.json @@ -1,409 +1 @@ -{ - "rogers_girolami_data":{ - "files":[ - [ - "firstcoursemldata.tar.gz" - ] - ], - "license":null, - "citation":"A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146", - "details":"Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.", - "urls":[ - "https://www.dropbox.com/sh/7p6tu1t29idgliq/_XqlH_3nt9/" - ], - "suffices":[ - [ - "?dl=1" - ] - ], - "size":21949154 - }, - "ankur_pose_data":{ - "files":[ - [ - "ankurDataPoseSilhouette.mat" - ] - ], - "citation":"3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.", - "license":null, - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/ankur_pose_data/" - ], - "details":"Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing.", - "size":1 - }, - "football_data":{ - "files":[ - [ - "E0.csv", "E1.csv", "E2.csv", "E3.csv" - ] - ], - "citation":"", - "license":null, - "urls":[ - "http://www.football-data.co.uk/mmz4281/" - ], - "details":"Results of English football matches since 1993/94 season.", - "size":1 - }, - "google_trends":{ - "files":[ - [ - ] - ], - "citation":"", - "license":null, - "urls":[ - "http://www.google.com/trends/" - ], - "details":"Google trends results.", - "size":0 - }, - "osu_accad":{ - "files":[ - [ - "swagger1TXT.ZIP", - "handspring1TXT.ZIP", - "quickwalkTXT.ZIP", - "run1TXT.ZIP", - "sprintTXT.ZIP", - "dogwalkTXT.ZIP", - "camper_04TXT.ZIP", - "dance_KB3_TXT.ZIP", - "per20_TXT.ZIP", - "perTWO07_TXT.ZIP", - "perTWO13_TXT.ZIP", - "perTWO14_TXT.ZIP", - "perTWO15_TXT.ZIP", - "perTWO16_TXT.ZIP" - ], - [ - "connections.txt" - ] - ], - "license":"Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).", - "citation":"The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.", - "details":"Motion capture data of different motions from the Open Motion Data Project at Ohio State University.", - "urls":[ - "http://accad.osu.edu/research/mocap/data/", - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/" - ], - "size":15922790 - }, - "isomap_face_data":{ - "files":[ - [ - "face_data.mat" - ] - ], - "license":null, - "citation":"A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", - "details":"Face data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/isomap_face_data/" - ], - "size":24229368 - }, - "boston_housing":{ - "files":[ - [ - "Index", - "housing.data", - "housing.names" - ] - ], - "license":null, - "citation":"Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.", - "details":"The Boston Housing data relates house values in Boston to a range of input variables.", - "urls":[ - "http://archive.ics.uci.edu/ml/machine-learning-databases/housing/" - ], - "size":51276 - }, - "cmu_mocap_full":{ - "files":[ - [ - "allasfamc.zip" - ] - ], - "license":"From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.", - "citation":"Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.\nThe database was created with funding from NSF EIA-0196217.", - "details":"CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.", - "urls":[ - "http://mocap.cs.cmu.edu/subjects" - ], - "size":null - }, - "brendan_faces":{ - "files":[ - [ - "frey_rawface.mat" - ] - ], - "license":null, - "citation":"Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.", - "details":"A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.", - "urls":[ - "http://www.cs.nyu.edu/~roweis/data/" - ], - "size":1100584 - }, - "olympic_marathon_men":{ - "files":[ - [ - "olympicMarathonTimes.csv" - ] - ], - "license":null, - "citation":null, - "details":"Olympic mens' marathon gold medal winning times from 1896 to 2012. Time given in pace (minutes per kilometer). Data is originally downloaded and collated from Wikipedia, we are not responsible for errors in the data", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olympic_marathon_men/" - ], - "size":584 - }, - "pumadyn-32nm":{ - "files":[ - [ - "pumadyn-32nm.tar.gz" - ] - ], - "license":"Data is made available by the Delve system at the University of Toronto", - "citation":"Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.", - "details":"Pumadyn non linear 32 input data set with moderate noise. See http://www.cs.utoronto.ca/~delve/data/pumadyn/desc.html for details.", - "urls":[ - "ftp://ftp.cs.toronto.edu/pub/neuron/delve/data/tarfiles/pumadyn-family/" - ], - "size":5861646 - }, - "ripley_prnn_data":{ - "files":[ - [ - "Cushings.dat", - "README", - "crabs.dat", - "fglass.dat", - "fglass.grp", - "pima.te", - "pima.tr", - "pima.tr2", - "synth.te", - "synth.tr", - "viruses.dat", - "virus3.dat" - ] - ], - "license":null, - "citation":"Pattern Recognition and Neural Networks by B.D. Ripley (1996) Cambridge University Press ISBN 0 521 46986 7", - "details":"Data sets from Brian Ripley's Pattern Recognition and Neural Networks", - "urls":[ - "http://www.stats.ox.ac.uk/pub/PRNN/" - ], - "size":93565 - }, - "three_phase_oil_flow":{ - "files":[ - [ - "DataTrnLbls.txt", - "DataTrn.txt", - "DataTst.txt", - "DataTstLbls.txt", - "DataVdn.txt", - "DataVdnLbls.txt" - ] - ], - "license":null, - "citation":"Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. Nuclear Instruments and Methods in Physics Research A327, 580-593", - "details":"The three phase oil data used initially for demonstrating the Generative Topographic mapping.", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/three_phase_oil_flow/" - ], - "size":712796 - }, - "robot_wireless":{ - "files":[ - [ - "uw-floor.txt" - ] - ], - "license":null, - "citation":"WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.", - "details":"Data created by Brian Ferris and Dieter Fox. Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/robot_wireless/" - ], - "size":284390 - }, - "xw_pen":{ - "files":[ - [ - "xw_pen_15.csv" - ] - ], - "license":null, - "citation":"Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005", - "details":"Accelerometer pen data used for robust regression by Tipping and Lawrence.", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/xw_pen/" - ], - "size":3410 - }, - "swiss_roll":{ - "files":[ - [ - "swiss_roll_data.mat" - ] - ], - "license":null, - "citation":"A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", - "details":"Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", - "urls":[ - "http://isomap.stanford.edu/" - ], - "size":800256 - }, - "osu_run1":{ - "files":[ - [ - "run1TXT.ZIP" - ], - [ - "connections.txt" - ] - ], - "license":"Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).", - "citation":"The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.", - "details":"Motion capture data of a stick man running from the Open Motion Data Project at Ohio State University.", - "urls":[ - "http://accad.osu.edu/research/mocap/data/", - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/" - ], - "size":338103 - }, - "creep_rupture":{ - "files":[ - [ - "creeprupt.tar" - ] - ], - "license":null, - "citation":"Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.", - "details":"Provides 2066 creep rupture test results of steels (mainly of two kinds of steels: 2.25Cr and 9-12 wt% Cr ferritic steels). See http://www.msm.cam.ac.uk/map/data/materials/creeprupt-b.html.", - "urls":[ - "http://www.msm.cam.ac.uk/map/data/tar/" - ], - "size":602797 - }, - "olivetti_faces":{ - "files":[ - [ - "att_faces.zip" - ], - [ - "olivettifaces.mat" - ] - ], - "license":null, - "citation":"Ferdinando Samaria and Andy Harter, Parameterisation of a Stochastic Model for Human Face Identification. Proceedings of 2nd IEEE Workshop on Applications of Computer Vision, Sarasota FL, December 1994", - "details":"Olivetti Research Labs Face data base, acquired between December 1992 and December 1994 in the Olivetti Research Lab, Cambridge (which later became AT&T Laboratories, Cambridge). When using these images please give credit to AT&T Laboratories, Cambridge. ", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olivetti_faces/", - "http://www.cs.nyu.edu/~roweis/data/" - ], - "size":8561331 - }, - "olivetti_glasses":{ - "files":[ - [ - "has_glasses.np" - ], - [ - "olivettifaces.mat" - ] - ], - "license":null, - "citation":"Information recorded in olivetti_faces entry. Should be used from there.", - "details":"Information recorded in olivetti_faces entry. Should be used from there.", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olivetti_faces/", - "http://www.cs.nyu.edu/~roweis/data/" - ], - "size":4261047 - }, - "della_gatta":{ - "files":[ - [ - "DellaGattadata.mat" - ] - ], - "license":null, - "citation":"Direct targets of the TRP63 transcription factor revealed by a combination of gene expression profiling and reverse engineering. Giusy Della Gatta, Mukesh Bansal, Alberto Ambesi-Impiombato, Dario Antonini, Caterina Missero, and Diego di Bernardo, Genome Research 2008", - "details":"The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA.", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/della_gatta/" - ], - "size":3729650 - }, - "epomeo_gpx":{ - "files":[ - [ - "endomondo_1.gpx", - "endomondo_2.gpx", - "garmin_watch_via_endomondo.gpx", - "viewranger_phone.gpx", - "viewranger_tablet.gpx" - ] - ], - "license":null, - "citation":"", - "details":"Five different GPS traces of the same run up Mount Epomeo in Ischia. The traces are from different sources. endomondo_1 and endomondo_2 are traces from the mobile phone app Endomondo, with a split in the middle. garmin_watch_via_endomondo is the trace from a Garmin watch, with a segment missing about 4 kilometers in. viewranger_phone and viewranger_tablet are traces from a phone and a tablet through the viewranger app. The viewranger_phone data comes from the same mobile phone as the Endomondo data (i.e. there are 3 GPS devices, but one device recorded two traces).", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/epomeo_gpx/" - ], - "size":2031872 - }, - "mauna_loa":{ - "files":[ - [ - "co2_mm_mlo.txt" - ] - ], - "license":"-------------------------------------------------------------------- USE OF NOAA ESRL DATA\n\n These data are made freely available to the public and the scientific community in the belief that their wide dissemination will lead to greater understanding and new scientific insights. The availability of these data does not constitute publication of the data. NOAA relies on the ethics and integrity of the user to insure that ESRL receives fair credit for their work. If the data are obtained for potential use in a publication or presentation, ESRL should be informed at the outset of the nature of this work. If the ESRL data are essential to the work, or if an important result or conclusion depends on the ESRL data, co-authorship may be appropriate. This should be discussed at an early stage in the work. Manuscripts using the ESRL data should be sent to ESRL for review before they are submitted for publication so we can insure that the quality and limitations of the data are accurately represented.\n\n Contact: Pieter Tans (303 497 6678; pieter.tans@noaa.gov)\n\n RECIPROCITY Use of these data implies an agreement to reciprocate. Laboratories making similar measurements agree to make their own data available to the general public and to the scientific community in an equally complete and easily accessible form. Modelers are encouraged to make available to the community, upon request, their own tools used in the interpretation of the ESRL data, namely well documented model code, transport fields, and additional information necessary for other scientists to repeat the work and to run modified versions. Model availability includes collaborative support for new users of the models.\n --------------------------------------------------------------------\n\n See www.esrl.noaa.gov/gmd/ccgg/trends/ for additional details.", - "citation":"Mauna Loa Data. Dr. Pieter Tans, NOAA/ESRL (www.esrl.noaa.gov/gmd/ccgg/trends/) and Dr. Ralph Keeling, Scripps Institution of Oceanography (scrippsco2.ucsd.edu/).", - "details":"The 'average' column contains the monthly mean CO2 mole fraction determined from daily averages. The mole fraction of CO2, expressed as parts per million (ppm) is the number of molecules of CO2 in every one million molecules of dried air (water vapor removed). If there are missing days concentrated either early or late in the month, the monthly mean is corrected to the middle of the month using the average seasonal cycle. Missing months are denoted by -99.99. The 'interpolated' column includes average values from the preceding column and interpolated values where data are missing. Interpolated values are computed in two steps. First, we compute for each month the average seasonal cycle in a 7-year window around each monthly value. In this way the seasonal cycle is allowed to change slowly over time. We then determine the 'trend' value for each month by removing the seasonal cycle; this result is shown in the 'trend' column. Trend values are linearly interpolated for missing months. The interpolated monthly mean is then the sum of the average seasonal cycle value and the trend value for the missing month.\n\nNOTE: In general, the data presented for the last year are subject to change, depending on recalibration of the reference gas mixtures used, and other quality control procedures. Occasionally, earlier years may also be changed for the same reasons. Usually these changes are minor.\n\nCO2 expressed as a mole fraction in dry air, micromol/mol, abbreviated as ppm \n\n (-99.99 missing data; -1 no data for daily means in month)", - "urls":[ - "ftp://aftp.cmdl.noaa.gov/products/trends/co2/" - ], - "size":46779 - }, - "boxjenkins_airline":{ - "files":[ - [ - "boxjenkins_airline.csv" - ] - ], - "license":"You may copy and redistribute the data. You may make derivative works from the data. You may use the data for commercial purposes. You may not sublicence the data when redistributing it. You may not redistribute the data under a different license. Source attribution on any use of this data: Must refer source.", - "citation":"Box & Jenkins (1976), in file: data/airpass, Description: International airline passengers: monthly totals in thousands. Jan 49 – Dec 60", - "details":"International airline passengers, monthly totals from January 1949 to December 1960.", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/boxjenkins_airline/" - ], - "size":46779 - }, - - "decampos_characters":{ - "files":[ - [ - "characters.npy", - "digits.npy" - ] - ], - "license":null, - "citation":"T. de Campos, B. R. Babu, and M. Varma. Character recognition in natural images. VISAPP 2009.", - "details":"Examples of hand written digits taken from the de Campos et al paper on Character Recognition in Natural Images.", - "urls":[ - "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/decampos_digits/" - ], - "size":2031872 - } -} +{"rogers_girolami_data": {"files": [["firstcoursemldata.tar.gz"]], "license": null, "citation": "A First Course in Machine Learning. Simon Rogers and Mark Girolami: Chapman & Hall/CRC, ISBN-13: 978-1439824146", "details": "Data from the textbook 'A First Course in Machine Learning'. Available from http://www.dcs.gla.ac.uk/~srogers/firstcourseml/.", "urls": ["https://www.dropbox.com/sh/7p6tu1t29idgliq/_XqlH_3nt9/"], "suffices": [["?dl=1"]], "size": 21949154}, "ankur_pose_data": {"files": [["ankurDataPoseSilhouette.mat"]], "citation": "3D Human Pose from Silhouettes by Relevance Vector Regression (In CVPR'04). A. Agarwal and B. Triggs.", "license": null, "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/ankur_pose_data/"], "details": "Artificially generated data of silhouettes given poses. Note that the data does not display a left/right ambiguity because across the entire data set one of the arms sticks out more the the other, disambiguating the pose as to which way the individual is facing."}, "osu_accad": {"files": [["swagger1TXT.ZIP", "handspring1TXT.ZIP", "quickwalkTXT.ZIP", "run1TXT.ZIP", "sprintTXT.ZIP", "dogwalkTXT.ZIP", "camper_04TXT.ZIP", "dance_KB3_TXT.ZIP", "per20_TXT.ZIP", "perTWO07_TXT.ZIP", "perTWO13_TXT.ZIP", "perTWO14_TXT.ZIP", "perTWO15_TXT.ZIP", "perTWO16_TXT.ZIP"], ["connections.txt"]], "license": "Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).", "citation": "The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.", "details": "Motion capture data of different motions from the Open Motion Data Project at Ohio State University.", "urls": ["http://accad.osu.edu/research/mocap/data/", "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/"], "size": 15922790}, "isomap_face_data": {"files": [["face_data.mat"]], "license": null, "citation": "A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", "details": "Face data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/isomap_face_data/"], "size": 24229368}, "boston_housing": {"files": [["Index", "housing.data", "housing.names"]], "license": null, "citation": "Harrison, D. and Rubinfeld, D.L. 'Hedonic prices and the demand for clean air', J. Environ. Economics & Management, vol.5, 81-102, 1978.", "details": "The Boston Housing data relates house values in Boston to a range of input variables.", "urls": ["http://archive.ics.uci.edu/ml/machine-learning-databases/housing/"], "size": 51276}, "cmu_mocap_full": {"files": [["allasfamc.zip"]], "license": "From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.", "citation": "Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.'\n 'The database was created with funding from NSF EIA-0196217.", "details": "CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.", "urls": ["http://mocap.cs.cmu.edu"], "size": null}, "brendan_faces": {"files": [["frey_rawface.mat"]], "license": null, "citation": "Frey, B. J., Colmenarez, A and Huang, T. S. Mixtures of Local Linear Subspaces for Face Recognition. Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition 1998, 32-37, June 1998. Computer Society Press, Los Alamitos, CA.", "details": "A video of Brendan Frey's face popularized as a benchmark for visualization by the Locally Linear Embedding.", "urls": ["http://www.cs.nyu.edu/~roweis/data/"], "size": 1100584}, "olympic_marathon_men": {"files": [["olympicMarathonTimes.csv"]], "license": null, "citation": null, "details": "Olympic mens' marathon gold medal winning times from 1896 to 2012. Time given in pace (minutes per kilometer). Data is originally downloaded and collated from Wikipedia, we are not responsible for errors in the data", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olympic_marathon_men/"], "size": 584}, "pumadyn-32nm": {"files": [["pumadyn-32nm.tar.gz"]], "license": "Data is made available by the Delve system at the University of Toronto", "citation": "Created by Zoubin Ghahramani using the Matlab Robotics Toolbox of Peter Corke. Corke, P. I. (1996). A Robotics Toolbox for MATLAB. IEEE Robotics and Automation Magazine, 3 (1): 24-32.", "details": "Pumadyn non linear 32 input data set with moderate noise. See http://www.cs.utoronto.ca/~delve/data/pumadyn/desc.html for details.", "urls": ["ftp://ftp.cs.toronto.edu/pub/neuron/delve/data/tarfiles/pumadyn-family/"], "size": 5861646}, "ripley_prnn_data": {"files": [["Cushings.dat", "README", "crabs.dat", "fglass.dat", "fglass.grp", "pima.te", "pima.tr", "pima.tr2", "synth.te", "synth.tr", "viruses.dat", "virus3.dat"]], "license": null, "citation": "Pattern Recognition and Neural Networks by B.D. Ripley (1996) Cambridge University Press ISBN 0 521 46986 7", "details": "Data sets from Brian Ripley's Pattern Recognition and Neural Networks", "urls": ["http://www.stats.ox.ac.uk/pub/PRNN/"], "size": 93565}, "three_phase_oil_flow": {"files": [["DataTrnLbls.txt", "DataTrn.txt", "DataTst.txt", "DataTstLbls.txt", "DataVdn.txt", "DataVdnLbls.txt"]], "license": null, "citation": "Bishop, C. M. and G. D. James (1993). Analysis of multiphase flows using dual-energy gamma densitometry and neural networks. Nuclear Instruments and Methods in Physics Research A327, 580-593", "details": "The three phase oil data used initially for demonstrating the Generative Topographic mapping.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/three_phase_oil_flow/"], "size": 712796}, "robot_wireless": {"files": [["uw-floor.txt"]], "license": null, "citation": "WiFi-SLAM using Gaussian Process Latent Variable Models by Brian Ferris, Dieter Fox and Neil Lawrence in IJCAI'07 Proceedings pages 2480-2485. Data used in A Unifying Probabilistic Perspective for Spectral Dimensionality Reduction: Insights and New Models by Neil D. Lawrence, JMLR 13 pg 1609--1638, 2012.", "details": "Data created by Brian Ferris and Dieter Fox. Consists of WiFi access point strengths taken during a circuit of the Paul Allen building at the University of Washington.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/robot_wireless/"], "size": 284390}, "xw_pen": {"files": [["xw_pen_15.csv"]], "license": null, "citation": "Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005", "details": "Accelerometer pen data used for robust regression by Tipping and Lawrence.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/xw_pen/"], "size": 3410}, "swiss_roll": {"files": [["swiss_roll_data.mat"]], "license": null, "citation": "A Global Geometric Framework for Nonlinear Dimensionality Reduction, J. B. Tenenbaum, V. de Silva and J. C. Langford, Science 290 (5500): 2319-2323, 22 December 2000", "details": "Swiss roll data made available by Tenenbaum, de Silva and Langford to demonstrate isomap, available from http://isomap.stanford.edu/datasets.html.", "urls": ["http://isomap.stanford.edu/"], "size": 800256}, "osu_run1": {"files": [["run1TXT.ZIP"], ["connections.txt"]], "license": "Data is licensed under a Creative Commons Attribution-NonCommercial-ShareAlike 3.0 Unported License (http://creativecommons.org/licenses/by-nc-sa/3.0/).", "citation": "The Open Motion Data Project by The Ohio State University Advanced Computing Center for the Arts and Design, http://accad.osu.edu/research/mocap/mocap_data.htm.", "details": "Motion capture data of a stick man running from the Open Motion Data Project at Ohio State University.", "urls": ["http://accad.osu.edu/research/mocap/data/", "http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/stick/"], "size": 338103}, "creep_rupture": {"files": [["creeprupt.tar"]], "license": null, "citation": "Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.", "details": "Provides 2066 creep rupture test results of steels (mainly of two kinds of steels: 2.25Cr and 9-12 wt% Cr ferritic steels). See http://www.msm.cam.ac.uk/map/data/materials/creeprupt-b.html.", "urls": ["http://www.msm.cam.ac.uk/map/data/tar/"], "size": 602797}, "hapmap3": {"files": [["hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2", "hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2", "relationships_w_pops_121708.txt"]], "license": "International HapMap Project Public Access License (http://hapmap.ncbi.nlm.nih.gov/cgi-perl/registration#licence)", "citation": "Gibbs, Richard A., et al. \"The international HapMap project.\" Nature 426.6968 (2003): 789-796.", "details": "HapMap Project: Single Nucleotide Polymorphism sequenced in all human populations. See http://www.nature.com/nature/journal/v426/n6968/abs/nature02168.html for details.", "urls": ["http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/latest_phaseIII_ncbi_b36/plink_format/"], "size": 3458246739}, "olivetti_faces": {"files": [["att_faces.zip"], ["olivettifaces.mat"]], "license": null, "citation": "Ferdinando Samaria and Andy Harter, Parameterisation of a Stochastic Model for Human Face Identification. Proceedings of 2nd IEEE Workshop on Applications of Computer Vision, Sarasota FL, December 1994", "details": "Olivetti Research Labs Face data base, acquired between December 1992 and December 1994 in the Olivetti Research Lab, Cambridge (which later became AT&T Laboratories, Cambridge). When using these images please give credit to AT&T Laboratories, Cambridge. ", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/olivetti_faces/", "http://www.cs.nyu.edu/~roweis/data/"], "size": 8561331}, "della_gatta": {"files": [["DellaGattadata.mat"]], "license": null, "citation": "Direct targets of the TRP63 transcription factor revealed by a combination of gene expression profiling and reverse engineering. Giusy Della Gatta, Mukesh Bansal, Alberto Ambesi-Impiombato, Dario Antonini, Caterina Missero, and Diego di Bernardo, Genome Research 2008", "details": "The full gene expression data set from della Gatta et al (http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2413161/) processed by RMA.", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/della_gatta/"], "size": 3729650}, "epomeo_gpx": {"files": [["endomondo_1.gpx", "endomondo_2.gpx", "garmin_watch_via_endomondo.gpx", "viewranger_phone.gpx", "viewranger_tablet.gpx"]], "license": null, "citation": "", "details": "Five different GPS traces of the same run up Mount Epomeo in Ischia. The traces are from different sources. endomondo_1 and endomondo_2 are traces from the mobile phone app Endomondo, with a split in the middle. garmin_watch_via_endomondo is the trace from a Garmin watch, with a segment missing about 4 kilometers in. viewranger_phone and viewranger_tablet are traces from a phone and a tablet through the viewranger app. The viewranger_phone data comes from the same mobile phone as the Endomondo data (i.e. there are 3 GPS devices, but one device recorded two traces).", "urls": ["http://staffwww.dcs.shef.ac.uk/people/N.Lawrence/dataset_mirror/epomeo_gpx/"], "size": 2031872}} \ No newline at end of file diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py index 04f09d3e..3f42055b 100644 --- a/GPy/util/datasets.py +++ b/GPy/util/datasets.py @@ -106,9 +106,30 @@ def download_url(url, store_directory, save_name = None, messages = True, suffix raise ValueError('Tried url ' + url + suffix + ' and received client error ' + str(response.code)) elif response.code > 499: raise ValueError('Tried url ' + url + suffix + ' and received server error ' + str(response.code)) - # if we wanted to get more sophisticated maybe we should check the response code here again even for successes. with open(save_name, 'wb') as f: - f.write(response.read()) + meta = response.info() + file_size = int(meta.getheaders("Content-Length")[0]) + status = "" + file_size_dl = 0 + block_sz = 8192 + line_length=30 + while True: + buff = response.read(block_sz) + if not buff: + break + file_size_dl += len(buff) + f.write(buff) + sys.stdout.write(" "*(len(status)) + "\r") + status = r"[{perc: <{ll}}] {dl:7.3f}/{full:.3f}MB".format(dl=file_size_dl/(1.*1e6), + full=file_size/(1.*1e6), ll=line_length, + perc="="*int(line_length*float(file_size_dl)/file_size)) + sys.stdout.write(status) + sys.stdout.flush() + sys.stdout.write(" "*(len(status)) + "\r") + print status + # if we wanted to get more sophisticated maybe we should check the response code here again even for successes. + #with open(save_name, 'wb') as f: + # f.write(response.read()) #urllib.urlretrieve(url+suffix, save_name, reporthook) @@ -552,6 +573,151 @@ def swiss_roll_generated(num_samples=1000, sigma=0.0): c = c[so, :] return {'Y':Y, 't':t, 'colors':c} +def hapmap3(data_set='hapmap3'): + """ + The HapMap phase three SNP dataset - 1184 samples out of 11 populations. + + SNP_matrix (A) encoding [see Paschou et all. 2007 (PCA-Correlated SNPs...)]: + Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then + + / 1, iff SNPij==(B1,B1) + Aij = | 0, iff SNPij==(B1,B2) + \ -1, iff SNPij==(B2,B2) + + The SNP data and the meta information (such as iid, sex and phenotype) are + stored in the dataframe datadf, index is the Individual ID, + with following columns for metainfo: + + * family_id -> Family ID + * paternal_id -> Paternal ID + * maternal_id -> Maternal ID + * sex -> Sex (1=male; 2=female; other=unknown) + * phenotype -> Phenotype (-9, or 0 for unknown) + * population -> Population string (e.g. 'ASW' - 'YRI') + * rest are SNP rs (ids) + + More information is given in infodf: + + * Chromosome: + - autosomal chromosemes -> 1-22 + - X X chromosome -> 23 + - Y Y chromosome -> 24 + - XY Pseudo-autosomal region of X -> 25 + - MT Mitochondrial -> 26 + * Relative Positon (to Chromosome) [base pairs] + """ + try: + from pandas import read_pickle, DataFrame + from sys import stdout + import bz2 + except ImportError as i: + raise i, "Need pandas for hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset" + if not data_available(data_set): + download_data(data_set) + dirpath = os.path.join(data_path,'hapmap3') + hapmap_file_name = 'hapmap3_r2_b36_fwd.consensus.qc.poly' + preprocessed_data_paths = [os.path.join(dirpath,hapmap_file_name + file_name) for file_name in \ + ['.snps.pickle', + '.info.pickle', + '.nan.pickle']] + if not reduce(lambda a,b: a and b, map(os.path.exists, preprocessed_data_paths)): + if not overide_manual_authorize and not prompt_user("Preprocessing requires ~25GB " + "of memory and can take a (very) long time, continue? [Y/n]"): + print "Preprocessing required for further usage." + return + status = "Preprocessing data, please be patient..." + print status + def write_status(message, progress, status): + stdout.write(" "*len(status)); stdout.write("\r"); stdout.flush() + status = r"[{perc: <{ll}}] {message: <13s}".format(message=message, ll=20, + perc="="*int(20.*progress/100.)) + stdout.write(status); stdout.flush() + return status + unpacked_files = [os.path.join(dirpath, hapmap_file_name+ending) for ending in ['.ped', '.map']] + if not reduce(lambda a,b: a and b, map(os.path.exists, unpacked_files)): + status=write_status('unpacking...', 0, '') + curr = 0 + for newfilepath in unpacked_files: + if not os.path.exists(newfilepath): + filepath = newfilepath + '.bz2' + file_size = os.path.getsize(filepath) + with open(newfilepath, 'wb') as new_file, open(filepath, 'rb') as f: + decomp = bz2.BZ2Decompressor() + file_processed = 0 + buffsize = 100 * 1024 + for data in iter(lambda : f.read(buffsize), b''): + new_file.write(decomp.decompress(data)) + file_processed += len(data) + status=write_status('unpacking...', curr+12.*file_processed/(file_size), status) + curr += 12 + status=write_status('unpacking...', curr, status) + status=write_status('reading .ped...', 25, status) + # Preprocess data: + snpstrnp = np.loadtxt(unpacked_files[0], dtype=str) + status=write_status('reading .map...', 33, status) + mapnp = np.loadtxt(unpacked_files[1], dtype=str) + status=write_status('reading relationships.txt...', 42, status) + # and metainfo: + infodf = DataFrame.from_csv(os.path.join(dirpath,'./relationships_w_pops_121708.txt'), header=0, sep='\t') + infodf.set_index('IID', inplace=1) + status=write_status('filtering nan...', 45, status) + snpstr = snpstrnp[:,6:].astype('S1').reshape(snpstrnp.shape[0], -1, 2) + inan = snpstr[:,:,0] == '0' + status=write_status('filtering reference alleles...', 55, status) + ref = np.array(map(lambda x: np.unique(x)[-2:], snpstr.swapaxes(0,1)[:,:,:])) + status=write_status('encoding snps...', 70, status) + # Encode the information for each gene in {-1,0,1}: + status=write_status('encoding snps...', 73, status) + snps = (snpstr==ref[None,:,:]) + status=write_status('encoding snps...', 76, status) + snps = (snps*np.array([1,-1])[None,None,:]) + status=write_status('encoding snps...', 78, status) + snps = snps.sum(-1) + status=write_status('encoding snps...', 81, status) + snps = snps.astype('i8') + status=write_status('marking nan values...', 88, status) + # put in nan values (masked as -128): + snps[inan] = -128 + status=write_status('setting up meta...', 94, status) + # get meta information: + metaheader = np.r_[['family_id', 'iid', 'paternal_id', 'maternal_id', 'sex', 'phenotype']] + metadf = DataFrame(columns=metaheader, data=snpstrnp[:,:6]) + metadf.set_index('iid', inplace=1) + metadf = metadf.join(infodf.population) + metadf.to_pickle(preprocessed_data_paths[1]) + # put everything together: + status=write_status('setting up snps...', 96, status) + snpsdf = DataFrame(index=metadf.index, data=snps, columns=mapnp[:,1]) + with open(preprocessed_data_paths[0], 'wb') as f: + pickle.dump(f, snpsdf, protocoll=-1) + status=write_status('setting up snps...', 98, status) + inandf = DataFrame(index=metadf.index, data=inan, columns=mapnp[:,1]) + inandf.to_pickle(preprocessed_data_paths[2]) + status=write_status('done :)', 100, status) + print '' + else: + print "loading snps..." + snpsdf = read_pickle(preprocessed_data_paths[0]) + print "loading metainfo..." + metadf = read_pickle(preprocessed_data_paths[1]) + print "loading nan entries..." + inandf = read_pickle(preprocessed_data_paths[2]) + snps = snpsdf.values + populations = metadf.population.values.astype('S3') + hapmap = dict(name=data_set, + description='The HapMap phase three SNP dataset - ' + '1184 samples out of 11 populations. inan is a ' + 'boolean array, containing wheather or not the ' + 'given entry is nan (nans are masked as ' + '-128 in snps).', + snpsdf=snpsdf, + metadf=metadf, + snps=snps, + inan=inandf.values, + inandf=inandf, + populations=populations) + return hapmap + def swiss_roll_1000(): return swiss_roll(num_samples=1000) diff --git a/GPy/util/datasets/data_resources_create.py b/GPy/util/datasets/data_resources_create.py index 8ae62a85..4e7c3524 100644 --- a/GPy/util/datasets/data_resources_create.py +++ b/GPy/util/datasets/data_resources_create.py @@ -24,12 +24,12 @@ data_resources = {'ankur_pose_data' : {'urls' : [neil_url + 'ankur_pose_data/'], 'license': None, 'size' : 1100584}, 'cmu_mocap_full' : {'urls' : ['http://mocap.cs.cmu.edu'], - 'files' : [['allasfamc.zip']], - 'citation' : """Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu. -The database was created with funding from NSF EIA-0196217.""", - 'details' : """CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.""", - 'license' : """From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.""", - 'size' : None}, + 'files' : [['allasfamc.zip']], + 'citation' : """Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.' + 'The database was created with funding from NSF EIA-0196217.""", + 'details' : """CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.""", + 'license' : """From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.""", + 'size' : None}, 'creep_rupture' : {'urls' : ['http://www.msm.cam.ac.uk/map/data/tar/'], 'files' : [['creeprupt.tar']], 'citation' : 'Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.', @@ -120,8 +120,49 @@ The database was created with funding from NSF EIA-0196217.""", 'details' : """Accelerometer pen data used for robust regression by Tipping and Lawrence.""", 'citation' : 'Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005', 'license' : None, - 'size' : 3410} + 'size' : 3410}, + 'hapmap3' : {'urls' : ['http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/latest_phaseIII_ncbi_b36/plink_format/'], + 'files' : [['hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2', 'hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2', 'relationships_w_pops_121708.txt']], + 'details' : """ + HapMap Project: Single Nucleotide Polymorphism sequenced in all human populations. + The HapMap phase three SNP dataset - 1184 samples out of 11 populations. + See http://www.nature.com/nature/journal/v426/n6968/abs/nature02168.html for details. + + SNP_matrix (A) encoding [see Paschou et all. 2007 (PCA-Correlated SNPs...)]: + Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then + + / 1, iff SNPij==(B1,B1) + Aij = | 0, iff SNPij==(B1,B2) + \ -1, iff SNPij==(B2,B2) + + The SNP data and the meta information (such as iid, sex and phenotype) are + stored in the dataframe datadf, index is the Individual ID, + with following columns for metainfo: + + * family_id -> Family ID + * paternal_id -> Paternal ID + * maternal_id -> Maternal ID + * sex -> Sex (1=male; 2=female; other=unknown) + * phenotype -> Phenotype (-9, or 0 for unknown) + * population -> Population string (e.g. 'ASW' - 'YRI') + * rest are SNP rs (ids) + + More information is given in infodf: + + * Chromosome: + - autosomal chromosemes -> 1-22 + - X X chromosome -> 23 + - Y Y chromosome -> 24 + - XY Pseudo-autosomal region of X -> 25 + - MT Mitochondrial -> 26 + * Relative Positon (to Chromosome) [base pairs] + + """, + 'citation': """Gibbs, Richard A., et al. "The international HapMap project." Nature 426.6968 (2003): 789-796.""", + 'license' : """International HapMap Project Public Access License (http://hapmap.ncbi.nlm.nih.gov/cgi-perl/registration#licence)""", + 'size' : 2*1729092237 + 62265}, } -with open('data_resources.json', 'w') as file: - json.dump(data_resources, file) +with open('data_resources.json', 'w') as f: + print "writing data_resources" + json.dump(data_resources, f)