Merge branch 'params' of https://github.com/SheffieldML/GPy into params

This commit is contained in:
Neil Lawrence 2014-04-17 07:05:20 -04:00
commit 483cb7ddc0
15 changed files with 376 additions and 495 deletions

View file

@ -1,7 +1,7 @@
# Copyright (c) 2012, GPy authors (see AUTHORS.txt).
# Licensed under the BSD 3-clause license (see LICENSE.txt)
__updated__ = '2014-03-31'
__updated__ = '2014-04-15'
import numpy as np
from parameter_core import Observable, Pickleable

View file

@ -3,6 +3,7 @@
import itertools
import numpy
np = numpy
from parameter_core import OptimizationHandlable, adjust_name_for_printing
from observable_array import ObsAr
@ -118,10 +119,6 @@ class Param(OptimizationHandlable, ObsAr):
except AttributeError: pass # returning 0d array or float, double etc
return new_arr
def __setitem__(self, s, val):
super(Param, self).__setitem__(s, val)
def _raveled_index(self, slice_index=None):
# return an index array on the raveled array, which is formed by the current_slice
# of this object
@ -311,15 +308,15 @@ class ParamConcatenation(object):
#===========================================================================
def __getitem__(self, s):
ind = numpy.zeros(sum(self._param_sizes), dtype=bool); ind[s] = True;
params = [p.param_array[ind[ps]] for p,ps in zip(self.params, self._param_slices_) if numpy.any(p.param_array[ind[ps]])]
params = [p.param_array.flat[ind[ps]] for p,ps in zip(self.params, self._param_slices_) if numpy.any(p.param_array.flat[ind[ps]])]
if len(params)==1: return params[0]
return ParamConcatenation(params)
def __setitem__(self, s, val, update=True):
if isinstance(val, ParamConcatenation):
val = val.values()
ind = numpy.zeros(sum(self._param_sizes), dtype=bool); ind[s] = True;
vals = self.values(); vals[s] = val; del val
[numpy.place(p, ind[ps], vals[ps])
vals = self.values(); vals[s] = val
[numpy.copyto(p, vals[ps], where=ind[ps])
for p, ps in zip(self.params, self._param_slices_)]
if update:
self.update_all_params()
@ -342,8 +339,8 @@ class ParamConcatenation(object):
self.update_all_params()
constrain_positive.__doc__ = Param.constrain_positive.__doc__
def constrain_fixed(self, warning=True):
[param.constrain_fixed(warning) for param in self.params]
def constrain_fixed(self, value=None, warning=True, trigger_parent=True):
[param.constrain_fixed(value, warning, trigger_parent) for param in self.params]
constrain_fixed.__doc__ = Param.constrain_fixed.__doc__
fix = constrain_fixed
@ -411,3 +408,42 @@ class ParamConcatenation(object):
return "\n".join(strings)
def __repr__(self):
return "\n".join(map(repr,self.params))
# In-place arithmetic operators for ParamConcatenation.
#
# Each operator applies the corresponding ndarray in-place op to a copy of
# the concatenated parameter values and writes the result back through
# __setitem__, which propagates the update into every contained Param.
#
# FIX: augmented-assignment hooks (__iadd__, __imul__, ...) must return the
# updated object; returning the implicit None makes ``pc += 1`` rebind the
# caller's name to None.  Every method below now returns ``self``.
def __ilshift__(self, *args, **kwargs):
    self[:] = np.ndarray.__ilshift__(self.values(), *args, **kwargs)
    return self
def __irshift__(self, *args, **kwargs):
    self[:] = np.ndarray.__irshift__(self.values(), *args, **kwargs)
    return self
def __ixor__(self, *args, **kwargs):
    self[:] = np.ndarray.__ixor__(self.values(), *args, **kwargs)
    return self
def __ipow__(self, *args, **kwargs):
    self[:] = np.ndarray.__ipow__(self.values(), *args, **kwargs)
    return self
def __ifloordiv__(self, *args, **kwargs):
    self[:] = np.ndarray.__ifloordiv__(self.values(), *args, **kwargs)
    return self
def __isub__(self, *args, **kwargs):
    self[:] = np.ndarray.__isub__(self.values(), *args, **kwargs)
    return self
def __ior__(self, *args, **kwargs):
    self[:] = np.ndarray.__ior__(self.values(), *args, **kwargs)
    return self
def __itruediv__(self, *args, **kwargs):
    self[:] = np.ndarray.__itruediv__(self.values(), *args, **kwargs)
    return self
def __idiv__(self, *args, **kwargs):
    self[:] = np.ndarray.__idiv__(self.values(), *args, **kwargs)
    return self
def __iand__(self, *args, **kwargs):
    self[:] = np.ndarray.__iand__(self.values(), *args, **kwargs)
    return self
def __imod__(self, *args, **kwargs):
    self[:] = np.ndarray.__imod__(self.values(), *args, **kwargs)
    return self
def __iadd__(self, *args, **kwargs):
    self[:] = np.ndarray.__iadd__(self.values(), *args, **kwargs)
    return self
def __imul__(self, *args, **kwargs):
    self[:] = np.ndarray.__imul__(self.values(), *args, **kwargs)
    return self

View file

@ -15,8 +15,9 @@ Observable Pattern for parameterization
from transformations import Logexp, NegativeLogexp, Logistic, __fixed__, FIXED, UNFIXED
import numpy as np
import re
__updated__ = '2014-03-31'
__updated__ = '2014-04-16'
class HierarchyError(Exception):
"""
@ -28,7 +29,15 @@ def adjust_name_for_printing(name):
Make sure a name can be printed, alongside used as a variable name.
"""
if name is not None:
return name.replace(" ", "_").replace(".", "_").replace("-", "_m_").replace("+", "_p_").replace("!", "_I_").replace("**", "_xx_").replace("*", "_x_").replace("/", "_l_").replace("@", '_at_')
name2 = name
name = name.replace(" ", "_").replace(".", "_").replace("-", "_m_")
name = name.replace("+", "_p_").replace("!", "_I_")
name = name.replace("**", "_xx_").replace("*", "_x_")
name = name.replace("/", "_l_").replace("@", '_at_')
name = name.replace("(", "_of_").replace(")", "")
if re.match(r'^[a-zA-Z_][a-zA-Z0-9-_]*$', name) is None:
raise NameError, "name {} converted to {} cannot be further converted to valid python variable name!".format(name2, name)
return name
return ''
@ -458,7 +467,7 @@ class Constrainable(Nameable, Indexable, Observable):
Constrain the parameter to the given
:py:class:`GPy.core.transformations.Transformation`.
"""
self.param_array[:] = transform.initialize(self.param_array)
self.param_array[...] = transform.initialize(self.param_array)
reconstrained = self.unconstrain()
self._add_to_index_operations(self.constraints, reconstrained, transform, warning)
self.notify_observers(self, None if trigger_parent else -np.inf)

View file

@ -185,6 +185,8 @@ class Parameterized(Parameterizable):
return ParamConcatenation(paramlist)
def __setitem__(self, name, value, paramlist=None):
if value is None:
return # nothing to do here
if isinstance(name, (slice, tuple, np.ndarray)):
try:
self.param_array[name] = value
@ -197,8 +199,8 @@ class Parameterized(Parameterizable):
param[:] = value
def __setattr__(self, name, val):
# override the default behaviour, if setting a param, so broadcasting can be used
if hasattr(self, '_parameters_'):
# override the default behaviour, if setting a param, so broadcasting can be used
if hasattr(self, "_parameters_"):
pnames = self.parameter_names(False, adjust_for_printing=True, recursive=False)
if name in pnames: self._parameters_[pnames.index(name)][:] = val; return
object.__setattr__(self, name, val);

View file

@ -192,17 +192,22 @@ class VarDTC(object):
class VarDTCMissingData(object):
const_jitter = 1e-6
def __init__(self, limit=1):
def __init__(self, limit=1, inan=None):
from ...util.caching import Cacher
self._Y = Cacher(self._subarray_computations, limit)
self._inan = inan
pass
def set_limit(self, limit):
self._Y.limit = limit
def _subarray_computations(self, Y):
inan = np.isnan(Y)
has_none = inan.any()
if self._inan is None:
inan = np.isnan(Y)
has_none = inan.any()
else:
inan = self._inan
has_none = True
if has_none:
from ...util.subarray_and_sorting import common_subarrays
self._subarray_indices = []

View file

@ -13,7 +13,7 @@ class Kern(Parameterized):
#===========================================================================
# This adds input slice support. The rather ugly code for slicing can be
# found in kernel_slice_operations
#__metaclass__ = KernCallsViaSlicerMeta
__metaclass__ = KernCallsViaSlicerMeta
#===========================================================================
_support_GPU=False
def __init__(self, input_dim, active_dims, name, useGPU=False, *a, **kw):
@ -21,26 +21,50 @@ class Kern(Parameterized):
The base class for a kernel: a positive definite function
which forms a covariance function (kernel).
input_dim:
is the number of dimensions to work on. Make sure to give the
tight dimensionality of inputs.
You most likely want this to be the integer telling the number of
input dimensions of the kernel.
If this is not an integer (!) we will work on the whole input matrix X,
and not check whether dimensions match or not (!).
active_dims:
is the active_dimensions of inputs X we will work on.
All kernels will get sliced Xes as inputs, if active_dims is not None
if active_dims is None, slicing is switched off and all X will be passed through as given.
:param int input_dim: the number of input dimensions to the function
:param array-like|slice active_dims: list of indices on which dimensions this kernel works on
:param array-like|slice|None active_dims: list of indices on which dimensions this kernel works on, or none if no slicing
Do not instantiate.
"""
super(Kern, self).__init__(name=name, *a, **kw)
self.active_dims = active_dims if active_dims is not None else slice(0, input_dim)
self.input_dim = input_dim
assert isinstance(self.active_dims, (slice, list, tuple, np.ndarray)), 'active_dims needs to be an array-like or slice object over dimensions, {} given'.format(self.active_dims.__class__)
if isinstance(self.active_dims, slice):
self.active_dims = slice(self.active_dims.start or 0, self.active_dims.stop or self.input_dim, self.active_dims.step or 1)
active_dim_size = int(np.round((self.active_dims.stop-self.active_dims.start)/self.active_dims.step))
elif isinstance(self.active_dims, np.ndarray):
assert self.active_dims.ndim == 1, 'only flat indices allowed, given active_dims.shape={}, provide only indexes to the dimensions of the input'.format(self.active_dims.shape)
active_dim_size = self.active_dims.size
else:
active_dim_size = len(self.active_dims)
assert active_dim_size == self.input_dim, "input_dim={} does not match len(active_dim)={}, active_dims={}".format(self.input_dim, active_dim_size, self.active_dims)
try:
self.input_dim = int(input_dim)
self.active_dims = active_dims# if active_dims is not None else slice(0, input_dim, 1)
except TypeError:
# input_dim is something else then an integer
self.input_dim = input_dim
if active_dims is not None:
print "WARNING: given input_dim={} is not an integer and active_dims={} is given, switching off slicing"
self.active_dims = None
if self.active_dims is not None and self.input_dim is not None:
assert isinstance(self.active_dims, (slice, list, tuple, np.ndarray)), 'active_dims needs to be an array-like or slice object over dimensions, {} given'.format(self.active_dims.__class__)
if isinstance(self.active_dims, slice):
self.active_dims = slice(self.active_dims.start or 0, self.active_dims.stop or self.input_dim, self.active_dims.step or 1)
active_dim_size = int(np.round((self.active_dims.stop-self.active_dims.start)/self.active_dims.step))
elif isinstance(self.active_dims, np.ndarray):
#assert np.all(self.active_dims >= 0), 'active dimensions need to be positive. negative indexing is not allowed'
assert self.active_dims.ndim == 1, 'only flat indices allowed, given active_dims.shape={}, provide only indexes to the dimensions (columns) of the input'.format(self.active_dims.shape)
active_dim_size = self.active_dims.size
else:
active_dim_size = len(self.active_dims)
assert active_dim_size == self.input_dim, "input_dim={} does not match len(active_dim)={}, active_dims={}".format(self.input_dim, active_dim_size, self.active_dims)
self._sliced_X = 0
self.useGPU = self._support_GPU and useGPU
@Cache_this(limit=10)
@ -205,9 +229,12 @@ class CombinationKernel(Kern):
return self._parameters_
def get_input_dim_active_dims(self, kernels, extra_dims = None):
active_dims = reduce(np.union1d, (np.r_[x.active_dims] for x in kernels), np.array([], dtype=int))
input_dim = active_dims.max()+1 + (len(np.r_[extra_dims]) if extra_dims is not None else 0)
active_dims = slice(0, input_dim, 1)
#active_dims = reduce(np.union1d, (np.r_[x.active_dims] for x in kernels), np.array([], dtype=int))
#active_dims = np.array(np.concatenate((active_dims, extra_dims if extra_dims is not None else [])), dtype=int)
input_dim = np.array([k.input_dim for k in kernels])
if np.all(input_dim[0]==input_dim):
input_dim = input_dim[0]
active_dims = None
return input_dim, active_dims
def input_sensitivity(self):

View file

@ -33,8 +33,11 @@ class _Slice_wrap(object):
def __init__(self, k, X, X2=None):
self.k = k
self.shape = X.shape
if self.k._sliced_X == 0:
assert X.shape[1] > max(np.r_[self.k.active_dims]), "At least {} dimensional X needed".format(max(np.r_[self.k.active_dims]))
assert X.ndim == 2, "only matrices are allowed as inputs to kernels for now, given X.shape={!s}".format(X.shape)
if X2 is not None:
assert X2.ndim == 2, "only matrices are allowed as inputs to kernels for now, given X2.shape={!s}".format(X2.shape)
if (self.k.active_dims is not None) and (self.k._sliced_X == 0):
assert X.shape[1] >= len(np.r_[self.k.active_dims]), "At least {} dimensional X needed, X.shape={!s}".format(len(np.r_[self.k.active_dims]), X.shape)
self.X = self.k._slice_X(X)
self.X2 = self.k._slice_X(X2) if X2 is not None else X2
self.ret = True

View file

@ -139,7 +139,7 @@ class Stationary(Kern):
#self.lengthscale.gradient = -((dL_dr*rinv)[:,:,None]*x_xl3).sum(0).sum(0)/self.lengthscale**3
tmp = dL_dr*self._inv_dist(X, X2)
if X2 is None: X2 = X
self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(self._slice_X(X)[:,q:q+1] - self._slice_X(X2)[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in xrange(self.input_dim)])
self.lengthscale.gradient = np.array([np.einsum('ij,ij,...', tmp, np.square(X[:,q:q+1] - X2[:,q:q+1].T), -1./self.lengthscale[q]**3) for q in xrange(self.input_dim)])
else:
r = self._scaled_dist(X, X2)
self.lengthscale.gradient = -np.sum(dL_dr*r)/self.lengthscale

View file

@ -9,8 +9,6 @@ from GPy.core.parameterization.param import Param
verbose = 0
np.random.seed(50)
class Kern_check_model(GPy.core.Model):
"""
@ -260,7 +258,6 @@ class KernelGradientTestsContinuous(unittest.TestCase):
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
def test_Prod3(self):
k = GPy.kern.Matern32(2, active_dims=[2,3]) * (GPy.kern.RBF(2, active_dims=[0,4]) + GPy.kern.Linear(self.D))
k = (GPy.kern.RBF(2, active_dims=[0,4]) * GPy.kern.Linear(self.D))
k.randomize()
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
@ -274,7 +271,7 @@ class KernelGradientTestsContinuous(unittest.TestCase):
def test_Add_dims(self):
k = GPy.kern.Matern32(2, active_dims=[2,self.D]) + GPy.kern.RBF(2, active_dims=[0,4]) + GPy.kern.Linear(self.D)
k.randomize()
self.assertRaises(AssertionError, k.K, self.X)
self.assertRaises(IndexError, k.K, self.X)
k = GPy.kern.Matern32(2, active_dims=[2,self.D-1]) + GPy.kern.RBF(2, active_dims=[0,4]) + GPy.kern.Linear(self.D)
k.randomize()
# assert it runs:
@ -303,46 +300,26 @@ class KernelGradientTestsContinuous(unittest.TestCase):
k.randomize()
self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
#TODO: turn off grad checking wrt X for indexed kernels like coregionalize
# class KernelGradientTestsContinuous1D(unittest.TestCase):
# def setUp(self):
# self.N, self.D = 100, 1
# self.X = np.random.randn(self.N,self.D)
# self.X2 = np.random.randn(self.N+10,self.D)
#
# continuous_kerns = ['RBF', 'Linear']
# self.kernclasses = [getattr(GPy.kern, s) for s in continuous_kerns]
#
# def test_PeriodicExponential(self):
# k = GPy.kern.PeriodicExponential(self.D)
# k.randomize()
# self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
#
# def test_PeriodicMatern32(self):
# k = GPy.kern.PeriodicMatern32(self.D)
# k.randomize()
# self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
#
# def test_PeriodicMatern52(self):
# k = GPy.kern.PeriodicMatern52(self.D)
# k.randomize()
# self.assertTrue(check_kernel_gradient_functions(k, X=self.X, X2=self.X2, verbose=verbose))
class KernelTestsMiscellaneous(unittest.TestCase):
def setUp(self):
N, D = 100, 10
self.X = np.linspace(-np.pi, +np.pi, N)[:,None] * np.ones(D)
self.X = np.linspace(-np.pi, +np.pi, N)[:,None] * np.random.uniform(-10,10,D)
self.rbf = GPy.kern.RBF(2, active_dims=slice(0,4,2))
self.linear = GPy.kern.Linear(2, active_dims=(3,9))
self.matern = GPy.kern.Matern32(3, active_dims=np.array([2,4,9]))
self.matern = GPy.kern.Matern32(3, active_dims=np.array([1,7,9]))
self.sumkern = self.rbf + self.linear
self.sumkern += self.matern
self.sumkern.randomize()
def test_active_dims(self):
self.assertEqual(self.sumkern.input_dim, 10)
self.assertEqual(self.sumkern.active_dims, slice(0, 10, 1))
# test the automatic dim detection expression for slices:
start, stop = 0, 277
for i in range(start,stop,7):
for j in range(1,4):
GPy.kern.Kern(int(np.round((i+1)/j)), slice(0, i+1, j), "testkern")
# test the ability to have only one dim
sk = GPy.kern.RBF(2) + GPy.kern.Matern32(2)
self.assertEqual(sk.input_dim, 2)
def test_which_parts(self):
self.assertTrue(np.allclose(self.sumkern.K(self.X, which_parts=[self.linear, self.matern]), self.linear.K(self.X)+self.matern.K(self.X)))
@ -365,7 +342,7 @@ class KernelTestsNonContinuous(unittest.TestCase):
self.X2 = np.random.randn((N0+N1)*2, self.D+1)
self.X2[:(N0*2), -1] = 0
self.X2[(N0*2):, -1] = 1
def test_IndependentOutputs(self):
k = GPy.kern.RBF(self.D)
kern = GPy.kern.IndependentOutputs(k, -1, 'ind_single')
@ -373,7 +350,7 @@ class KernelTestsNonContinuous(unittest.TestCase):
k = [GPy.kern.RBF(1, active_dims=[1], name='rbf1'), GPy.kern.RBF(self.D, name='rbf012'), GPy.kern.RBF(2, active_dims=[0,2], name='rbf02')]
kern = GPy.kern.IndependentOutputs(k, -1, name='ind_split')
self.assertTrue(check_kernel_gradient_functions(kern, X=self.X, X2=self.X2, verbose=verbose, fixed_X_dims=-1))
def test_ODE_UY(self):
kern = GPy.kern.ODE_UY(2, active_dims=[0, self.D])
X = self.X[self.X[:,-1]!=2]

View file

@ -130,6 +130,17 @@ class MiscTests(unittest.TestCase):
m2.kern[:] = m.kern[''].values()
np.testing.assert_equal(m.log_likelihood(), m2.log_likelihood())
def test_model_set_params(self):
m = GPy.models.GPRegression(self.X, self.Y)
lengthscale = np.random.uniform()
m.kern.lengthscale = lengthscale
np.testing.assert_equal(m.kern.lengthscale, lengthscale)
m.kern.lengthscale *= 1
m['.*var'] -= .1
np.testing.assert_equal(m.kern.lengthscale, lengthscale)
m.optimize()
print m
def test_model_optimize(self):
X = np.random.uniform(-3., 3., (20, 1))
Y = np.sin(X) + np.random.randn(20, 1) * 0.05

View file

@ -142,6 +142,17 @@ class ParameterizedTest(unittest.TestCase):
self.testmodel.randomize()
self.assertEqual(val, self.testmodel.kern.lengthscale)
def test_regular_expression_misc(self):
self.testmodel.kern.lengthscale.fix()
val = float(self.testmodel.kern.lengthscale)
self.testmodel.randomize()
self.assertEqual(val, self.testmodel.kern.lengthscale)
variances = self.testmodel['.*var'].values()
self.testmodel['.*var'].fix()
self.testmodel.randomize()
np.testing.assert_equal(variances, self.testmodel['.*var'].values())
def test_printing(self):
print self.test1
print self.param

View file

@ -185,6 +185,7 @@ class Test(ListDictTestCase):
def _callback(self, what, which):
what.count += 1
@unittest.skip
def test_add_observer(self):
par = toy_rbf_1d_50(optimize=0, plot=0)
par.name = "original"

File diff suppressed because one or more lines are too long

View file

@ -106,9 +106,30 @@ def download_url(url, store_directory, save_name = None, messages = True, suffix
raise ValueError('Tried url ' + url + suffix + ' and received client error ' + str(response.code))
elif response.code > 499:
raise ValueError('Tried url ' + url + suffix + ' and received server error ' + str(response.code))
# if we wanted to get more sophisticated maybe we should check the response code here again even for successes.
with open(save_name, 'wb') as f:
f.write(response.read())
meta = response.info()
file_size = int(meta.getheaders("Content-Length")[0])
status = ""
file_size_dl = 0
block_sz = 8192
line_length=30
while True:
buff = response.read(block_sz)
if not buff:
break
file_size_dl += len(buff)
f.write(buff)
sys.stdout.write(" "*(len(status)) + "\r")
status = r"[{perc: <{ll}}] {dl:7.3f}/{full:.3f}MB".format(dl=file_size_dl/(1.*1e6),
full=file_size/(1.*1e6), ll=line_length,
perc="="*int(line_length*float(file_size_dl)/file_size))
sys.stdout.write(status)
sys.stdout.flush()
sys.stdout.write(" "*(len(status)) + "\r")
print status
# if we wanted to get more sophisticated maybe we should check the response code here again even for successes.
#with open(save_name, 'wb') as f:
# f.write(response.read())
#urllib.urlretrieve(url+suffix, save_name, reporthook)
@ -552,6 +573,151 @@ def swiss_roll_generated(num_samples=1000, sigma=0.0):
c = c[so, :]
return {'Y':Y, 't':t, 'colors':c}
def hapmap3(data_set='hapmap3'):
"""
The HapMap phase three SNP dataset - 1184 samples out of 11 populations.
SNP_matrix (A) encoding [see Paschou et al. 2007 (PCA-Correlated SNPs...)]:
Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then
/ 1, iff SNPij==(B1,B1)
Aij = | 0, iff SNPij==(B1,B2)
\ -1, iff SNPij==(B2,B2)
The SNP data and the meta information (such as iid, sex and phenotype) are
stored in the dataframe datadf, index is the Individual ID,
with following columns for metainfo:
* family_id -> Family ID
* paternal_id -> Paternal ID
* maternal_id -> Maternal ID
* sex -> Sex (1=male; 2=female; other=unknown)
* phenotype -> Phenotype (-9, or 0 for unknown)
* population -> Population string (e.g. 'ASW' - 'YRI')
* rest are SNP rs (ids)
More information is given in infodf:
* Chromosome:
- autosomal chromosomes -> 1-22
- X X chromosome -> 23
- Y Y chromosome -> 24
- XY Pseudo-autosomal region of X -> 25
- MT Mitochondrial -> 26
* Relative Position (to Chromosome) [base pairs]
"""
try:
from pandas import read_pickle, DataFrame
from sys import stdout
import bz2
except ImportError as i:
raise i, "Need pandas for hapmap dataset, make sure to install pandas (http://pandas.pydata.org/) before loading the hapmap dataset"
if not data_available(data_set):
download_data(data_set)
dirpath = os.path.join(data_path,'hapmap3')
hapmap_file_name = 'hapmap3_r2_b36_fwd.consensus.qc.poly'
preprocessed_data_paths = [os.path.join(dirpath,hapmap_file_name + file_name) for file_name in \
['.snps.pickle',
'.info.pickle',
'.nan.pickle']]
if not reduce(lambda a,b: a and b, map(os.path.exists, preprocessed_data_paths)):
if not overide_manual_authorize and not prompt_user("Preprocessing requires ~25GB "
"of memory and can take a (very) long time, continue? [Y/n]"):
print "Preprocessing required for further usage."
return
status = "Preprocessing data, please be patient..."
print status
def write_status(message, progress, status):
stdout.write(" "*len(status)); stdout.write("\r"); stdout.flush()
status = r"[{perc: <{ll}}] {message: <13s}".format(message=message, ll=20,
perc="="*int(20.*progress/100.))
stdout.write(status); stdout.flush()
return status
unpacked_files = [os.path.join(dirpath, hapmap_file_name+ending) for ending in ['.ped', '.map']]
if not reduce(lambda a,b: a and b, map(os.path.exists, unpacked_files)):
status=write_status('unpacking...', 0, '')
curr = 0
for newfilepath in unpacked_files:
if not os.path.exists(newfilepath):
filepath = newfilepath + '.bz2'
file_size = os.path.getsize(filepath)
with open(newfilepath, 'wb') as new_file, open(filepath, 'rb') as f:
decomp = bz2.BZ2Decompressor()
file_processed = 0
buffsize = 100 * 1024
for data in iter(lambda : f.read(buffsize), b''):
new_file.write(decomp.decompress(data))
file_processed += len(data)
status=write_status('unpacking...', curr+12.*file_processed/(file_size), status)
curr += 12
status=write_status('unpacking...', curr, status)
status=write_status('reading .ped...', 25, status)
# Preprocess data:
snpstrnp = np.loadtxt(unpacked_files[0], dtype=str)
status=write_status('reading .map...', 33, status)
mapnp = np.loadtxt(unpacked_files[1], dtype=str)
status=write_status('reading relationships.txt...', 42, status)
# and metainfo:
infodf = DataFrame.from_csv(os.path.join(dirpath,'./relationships_w_pops_121708.txt'), header=0, sep='\t')
infodf.set_index('IID', inplace=1)
status=write_status('filtering nan...', 45, status)
snpstr = snpstrnp[:,6:].astype('S1').reshape(snpstrnp.shape[0], -1, 2)
inan = snpstr[:,:,0] == '0'
status=write_status('filtering reference alleles...', 55, status)
ref = np.array(map(lambda x: np.unique(x)[-2:], snpstr.swapaxes(0,1)[:,:,:]))
status=write_status('encoding snps...', 70, status)
# Encode the information for each gene in {-1,0,1}:
status=write_status('encoding snps...', 73, status)
snps = (snpstr==ref[None,:,:])
status=write_status('encoding snps...', 76, status)
snps = (snps*np.array([1,-1])[None,None,:])
status=write_status('encoding snps...', 78, status)
snps = snps.sum(-1)
status=write_status('encoding snps...', 81, status)
snps = snps.astype('i8')
status=write_status('marking nan values...', 88, status)
# put in nan values (masked as -128):
snps[inan] = -128
status=write_status('setting up meta...', 94, status)
# get meta information:
metaheader = np.r_[['family_id', 'iid', 'paternal_id', 'maternal_id', 'sex', 'phenotype']]
metadf = DataFrame(columns=metaheader, data=snpstrnp[:,:6])
metadf.set_index('iid', inplace=1)
metadf = metadf.join(infodf.population)
metadf.to_pickle(preprocessed_data_paths[1])
# put everything together:
status=write_status('setting up snps...', 96, status)
snpsdf = DataFrame(index=metadf.index, data=snps, columns=mapnp[:,1])
with open(preprocessed_data_paths[0], 'wb') as f:
pickle.dump(f, snpsdf, protocoll=-1)
status=write_status('setting up snps...', 98, status)
inandf = DataFrame(index=metadf.index, data=inan, columns=mapnp[:,1])
inandf.to_pickle(preprocessed_data_paths[2])
status=write_status('done :)', 100, status)
print ''
else:
print "loading snps..."
snpsdf = read_pickle(preprocessed_data_paths[0])
print "loading metainfo..."
metadf = read_pickle(preprocessed_data_paths[1])
print "loading nan entries..."
inandf = read_pickle(preprocessed_data_paths[2])
snps = snpsdf.values
populations = metadf.population.values.astype('S3')
hapmap = dict(name=data_set,
description='The HapMap phase three SNP dataset - '
'1184 samples out of 11 populations. inan is a '
'boolean array, containing wheather or not the '
'given entry is nan (nans are masked as '
'-128 in snps).',
snpsdf=snpsdf,
metadf=metadf,
snps=snps,
inan=inandf.values,
inandf=inandf,
populations=populations)
return hapmap
def swiss_roll_1000():
return swiss_roll(num_samples=1000)

View file

@ -24,12 +24,12 @@ data_resources = {'ankur_pose_data' : {'urls' : [neil_url + 'ankur_pose_data/'],
'license': None,
'size' : 1100584},
'cmu_mocap_full' : {'urls' : ['http://mocap.cs.cmu.edu'],
'files' : [['allasfamc.zip']],
'citation' : """Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.
The database was created with funding from NSF EIA-0196217.""",
'details' : """CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.""",
'license' : """From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.""",
'size' : None},
'files' : [['allasfamc.zip']],
'citation' : """Please include this in your acknowledgements: The data used in this project was obtained from mocap.cs.cmu.edu.'
'The database was created with funding from NSF EIA-0196217.""",
'details' : """CMU Motion Capture data base. Captured by a Vicon motion capture system consisting of 12 infrared MX-40 cameras, each of which is capable of recording at 120 Hz with images of 4 megapixel resolution. Motions are captured in a working volume of approximately 3m x 8m. The capture subject wears 41 markers and a stylish black garment.""",
'license' : """From http://mocap.cs.cmu.edu. This data is free for use in research projects. You may include this data in commercially-sold products, but you may not resell this data directly, even in converted form. If you publish results obtained using this data, we would appreciate it if you would send the citation to your published paper to jkh+mocap@cs.cmu.edu, and also would add this text to your acknowledgments section: The data used in this project was obtained from mocap.cs.cmu.edu. The database was created with funding from NSF EIA-0196217.""",
'size' : None},
'creep_rupture' : {'urls' : ['http://www.msm.cam.ac.uk/map/data/tar/'],
'files' : [['creeprupt.tar']],
'citation' : 'Materials Algorithms Project Data Library: MAP_DATA_CREEP_RUPTURE. F. Brun and T. Yoshida.',
@ -120,8 +120,49 @@ The database was created with funding from NSF EIA-0196217.""",
'details' : """Accelerometer pen data used for robust regression by Tipping and Lawrence.""",
'citation' : 'Michael E. Tipping and Neil D. Lawrence. Variational inference for Student-t models: Robust Bayesian interpolation and generalised component analysis. Neurocomputing, 69:123--141, 2005',
'license' : None,
'size' : 3410}
'size' : 3410},
'hapmap3' : {'urls' : ['http://hapmap.ncbi.nlm.nih.gov/downloads/genotypes/latest_phaseIII_ncbi_b36/plink_format/'],
'files' : [['hapmap3_r2_b36_fwd.consensus.qc.poly.map.bz2', 'hapmap3_r2_b36_fwd.consensus.qc.poly.ped.bz2', 'relationships_w_pops_121708.txt']],
'details' : """
HapMap Project: Single Nucleotide Polymorphism sequenced in all human populations.
The HapMap phase three SNP dataset - 1184 samples out of 11 populations.
See http://www.nature.com/nature/journal/v426/n6968/abs/nature02168.html for details.
SNP_matrix (A) encoding [see Paschou et all. 2007 (PCA-Correlated SNPs...)]:
Let (B1,B2) be the alphabetically sorted bases, which occur in the j-th SNP, then
/ 1, iff SNPij==(B1,B1)
Aij = | 0, iff SNPij==(B1,B2)
\ -1, iff SNPij==(B2,B2)
The SNP data and the meta information (such as iid, sex and phenotype) are
stored in the dataframe datadf, index is the Individual ID,
with following columns for metainfo:
* family_id -> Family ID
* paternal_id -> Paternal ID
* maternal_id -> Maternal ID
* sex -> Sex (1=male; 2=female; other=unknown)
* phenotype -> Phenotype (-9, or 0 for unknown)
* population -> Population string (e.g. 'ASW' - 'YRI')
* rest are SNP rs (ids)
More information is given in infodf:
* Chromosome:
- autosomal chromosemes -> 1-22
- X X chromosome -> 23
- Y Y chromosome -> 24
- XY Pseudo-autosomal region of X -> 25
- MT Mitochondrial -> 26
* Relative Positon (to Chromosome) [base pairs]
""",
'citation': """Gibbs, Richard A., et al. "The international HapMap project." Nature 426.6968 (2003): 789-796.""",
'license' : """International HapMap Project Public Access License (http://hapmap.ncbi.nlm.nih.gov/cgi-perl/registration#licence)""",
'size' : 2*1729092237 + 62265},
}
with open('data_resources.json', 'w') as file:
json.dump(data_resources, file)
with open('data_resources.json', 'w') as f:
print "writing data_resources"
json.dump(data_resources, f)