diff --git a/GPy/core/model.py b/GPy/core/model.py
index e2cab94b..3acb64b9 100644
--- a/GPy/core/model.py
+++ b/GPy/core/model.py
@@ -253,7 +253,7 @@ class Model(Parameterized):
     def _checkgrad(self, target_param=None, verbose=False, step=1e-6, tolerance=1e-3):
         """
-        Check the gradient of the ,odel by comparing to a numerical
-        estimate. If the verbose flag is passed, invividual
+        Check the gradient of the model by comparing to a numerical
+        estimate. If the verbose flag is passed, individual
         components are tested (and printed)
 
         :param verbose: If True, print a "full" checking of each parameter
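The _checkgrad docstring in the hunk above describes comparing the analytic gradient against a numerical estimate. As a minimal sketch of that comparison (not GPy's actual implementation; numerical_checkgrad, f and grad_f are illustrative names only), a central-difference check with the same step and tolerance defaults could look like:

    import numpy as np

    def numerical_checkgrad(f, grad_f, x, step=1e-6, tolerance=1e-3):
        # Central finite-difference estimate of df/dx_i around x.
        numerical = np.empty_like(x)
        for i in range(x.size):
            e = np.zeros_like(x)
            e[i] = step
            numerical[i] = (f(x + e) - f(x - e)) / (2. * step)
        # The ratio analytic/numerical should be close to 1 everywhere.
        ratio = grad_f(x) / numerical
        return np.all(np.abs(ratio - 1.) < tolerance)

On a built GPy model the public entry point is m.checkgrad(verbose=True), which prints the per-parameter comparison the docstring mentions.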
diff --git a/GPy/examples/dimensionality_reduction.py b/GPy/examples/dimensionality_reduction.py
index d0e35b71..7d9e320b 100644
--- a/GPy/examples/dimensionality_reduction.py
+++ b/GPy/examples/dimensionality_reduction.py
@@ -37,7 +37,7 @@ def bgplvm_test_model(optimize=False, verbose=1, plot=False, output_dim=200, nan
     # k = GPy.kern.RBF(input_dim, .5, _np.ones(input_dim) * 2., ARD=True) + GPy.kern.linear(input_dim, _np.ones(input_dim) * .2, ARD=True)
     p = .3
-    
+
     m = GPy.models.BayesianGPLVM(Y, input_dim, kernel=k, num_inducing=num_inducing)
 
     if nan:
@@ -144,7 +144,7 @@ def swiss_roll(optimize=True, verbose=1, plot=True, N=1000, num_inducing=25, Q=4
     m = BayesianGPLVM(Y, Q, X=X, X_variance=S, num_inducing=num_inducing, Z=Z, kernel=kernel)
     m.data_colors = c
     m.data_t = t
-    
+
     if optimize:
         m.optimize('bfgs', messages=verbose, max_iters=2e3)
@@ -169,7 +169,7 @@ def bgplvm_oil(optimize=True, verbose=1, plot=True, N=200, Q=7, num_inducing=40,
     Y = data['X'][:N]
     m = GPy.models.BayesianGPLVM(Y, Q, kernel=kernel, num_inducing=num_inducing, **k)
     m.data_labels = data['Y'][:N].argmax(axis=1)
-    
+
     if optimize:
         m.optimize('bfgs', messages=verbose, max_iters=max_iters, gtol=.05)
@@ -304,7 +304,7 @@ def bgplvm_simulation_missing_data(optimize=True, verbose=1,
     inan = _np.random.binomial(1, .6, size=Y.shape).astype(bool)
     Y[inan] = _np.nan
 
-    m = BayesianGPLVM(Y.copy(), Q, init="random", num_inducing=num_inducing, 
+    m = BayesianGPLVM(Y.copy(), Q, init="random", num_inducing=num_inducing,
                       inference_method=VarDTCMissingData(inan=inan), kernel=k)
     m.X.variance[:] = _np.random.uniform(0,.01,m.X.shape)
@@ -364,7 +364,7 @@ def mrd_simulation_missing_data(optimize=True, verbose=True, plot=True, plot_sim
     for inan in inanlist:
         imlist.append(VarDTCMissingData(limit=1, inan=inan))
 
-    m = MRD(Ylist, input_dim=Q, num_inducing=num_inducing, 
+    m = MRD(Ylist, input_dim=Q, num_inducing=num_inducing,
             kernel=k, inference_method=imlist, initx="random", initz='permute', **kw)
@@ -410,11 +410,11 @@ def olivetti_faces(optimize=True, verbose=True, plot=True):
     Yn /= Yn.std()
     m = GPy.models.BayesianGPLVM(Yn, Q, num_inducing=20)
-    
+
     if optimize:
         m.optimize('bfgs', messages=verbose, max_iters=1000)
     if plot:
         ax = m.plot_latent(which_indices=(0, 1))
-        y = m.likelihood.Y[0, :]
+        y = m.Y[0, :]
         data_show = GPy.plotting.matplot_dep.visualize.image_show(y[None, :], dimensions=(112, 92), transpose=False, invert=False, scale=False)
         lvm = GPy.plotting.matplot_dep.visualize.lvm(m.X.mean[0, :].copy(), m, data_show, ax)
         raw_input('Press enter to finish')
@@ -514,7 +514,7 @@ def stick_bgplvm(model=None, optimize=True, verbose=True, plot=True):
     data = GPy.util.datasets.osu_run1()
     Q = 6
-    kernel = GPy.kern.RBF(Q, lengthscale=np.repeat(.5, Q), ARD=True) 
+    kernel = GPy.kern.RBF(Q, lengthscale=np.repeat(.5, Q), ARD=True)
     m = BayesianGPLVM(data['Y'], Q, init="PCA", num_inducing=20, kernel=kernel)
     m.data = data
@@ -566,7 +566,7 @@ def ssgplvm_simulation_linear():
     import GPy
     N, D, Q = 1000, 20, 5
     pi = 0.2
-    
+
     def sample_X(Q, pi):
         x = np.empty(Q)
         dies = np.random.rand(Q)
@@ -576,7 +576,7 @@ def ssgplvm_simulation_linear():
         else:
             x[q] = 0.
         return x
-    
+
     Y = np.empty((N,D))
     X = np.empty((N,Q))
     # Generate data from random sampled weight matrices
@@ -584,4 +584,4 @@ def ssgplvm_simulation_linear():
         X[n] = sample_X(Q,pi)
         w = np.random.randn(D,Q)
         Y[n] = np.dot(w,X[n])
-    
+
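sample_X in ssgplvm_simulation_linear above draws each latent coordinate from a spike-and-slab prior: standard normal with probability pi, exactly zero otherwise. A vectorized sketch of the same draw (sample_X_all is a hypothetical helper, not part of GPy) removes the per-point Python loop:

    import numpy as np

    def sample_X_all(N, Q, pi):
        # 'Slab' indicator: True with probability pi, independently per entry.
        slab = np.random.rand(N, Q) < pi
        # Standard-normal draws where the slab fires, exact zeros elsewhere.
        return np.where(slab, np.random.randn(N, Q), 0.)

Note that the example redraws the weight matrix w for every point n; with a single fixed w of shape (D, Q) the whole generation would reduce to Y = sample_X_all(N, Q, pi).dot(w.T).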
""" @@ -406,11 +406,11 @@ def lee_yeast_ChIP(data_set='lee_yeast_ChIP'): dir_path = os.path.join(data_path, data_set) filename = os.path.join(dir_path, 'binding_by_gene.tsv') S = read_csv(filename, header=1, index_col=0, sep='\t') - transcription_factors = [col for col in S.columns if col[:7] != 'Unnamed'] + transcription_factors = [col for col in S.columns if col[:7] != 'Unnamed'] annotations = S[['Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3']] S = S[transcription_factors] return data_details_return({'annotations' : annotations, 'Y' : S, 'transcription_factors': transcription_factors}, data_set) - + def fruitfly_tomancak(data_set='fruitfly_tomancak', gene_number=None): @@ -425,7 +425,7 @@ def fruitfly_tomancak(data_set='fruitfly_tomancak', gene_number=None): xt = np.linspace(0, num_time-1, num_time) xr = np.linspace(0, num_repeats-1, num_repeats) xtime, xrepeat = np.meshgrid(xt, xr) - X = np.vstack((xtime.flatten(), xrepeat.flatten())).T + X = np.vstack((xtime.flatten(), xrepeat.flatten())).T return data_details_return({'X': X, 'Y': Y, 'gene_number' : gene_number}, data_set) def drosophila_protein(data_set='drosophila_protein'): @@ -467,7 +467,7 @@ def google_trends(query_terms=['big data', 'machine learning', 'data science'], """Data downloaded from Google trends for given query terms. Warning, if you use this function multiple times in a row you get blocked due to terms of service violations. The function will cache the result of your query, if you wish to refresh an old query set refresh_data to True. The function is inspired by this notebook: http://nbviewer.ipython.org/github/sahuguet/notebooks/blob/master/GoogleTrends%20meet%20Notebook.ipynb""" query_terms.sort() import pandas - + # Create directory name for data dir_path = os.path.join(data_path,'google_trends') if not os.path.isdir(dir_path): @@ -514,9 +514,9 @@ def google_trends(query_terms=['big data', 'machine learning', 'data science'], X = np.asarray([(row, i) for i in range(terms) for row in df.index]) Y = np.asarray([[df.ix[row][query_terms[i]]] for i in range(terms) for row in df.index ]) output_info = columns[1:] - + return data_details_return({'data frame' : df, 'X': X, 'Y': Y, 'query_terms': output_info, 'info': "Data downloaded from google trends with query terms: " + ', '.join(output_info) + '.'}, data_set) - + # The data sets def oil(data_set='three_phase_oil_flow'): """The three phase oil data from Bishop and James (1993).""" @@ -647,7 +647,7 @@ def decampos_digits(data_set='decampos_characters', which_digits=[0,1,2,3,4,5,6, lbls = np.array([[l]*num_samples for l in which_digits]).reshape(Y.shape[0], 1) str_lbls = np.array([[str(l)]*num_samples for l in which_digits]) return data_details_return({'Y': Y, 'lbls': lbls, 'str_lbls' : str_lbls, 'info': 'Digits data set from the de Campos characters data'}, data_set) - + def ripley_synth(data_set='ripley_prnn_data'): if not data_available(data_set): download_data(data_set) @@ -674,7 +674,7 @@ def mauna_loa(data_set='mauna_loa', num_train=545, refresh_data=False): Y = allY[:num_train, 0:1] Ytest = allY[num_train:, 0:1] return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'info': "Mauna Loa data with " + str(num_train) + " values used as training points."}, data_set) - + def boxjenkins_airline(data_set='boxjenkins_airline', num_train=96): path = os.path.join(data_path, data_set) @@ -686,7 +686,7 @@ def boxjenkins_airline(data_set='boxjenkins_airline', num_train=96): Xtest = data[num_train:, 0:1] Ytest = data[num_train:, 1:2] return 
@@ -320,7 +320,7 @@ def della_gatta_TRP63_gene_expression(data_set='della_gatta', gene_number=None):
         Y = Y[:, None]
 
     return data_details_return({'X': X, 'Y': Y, 'gene_number' : gene_number}, data_set)
-    
+
 def football_data(season='1314', data_set='football_data'):
     """Football data from English games since 1993. This downloads data from football-data.co.uk for the given season.
     """
@@ -406,11 +406,11 @@ def lee_yeast_ChIP(data_set='lee_yeast_ChIP'):
     dir_path = os.path.join(data_path, data_set)
     filename = os.path.join(dir_path, 'binding_by_gene.tsv')
     S = read_csv(filename, header=1, index_col=0, sep='\t')
-    transcription_factors = [col for col in S.columns if col[:7] != 'Unnamed'] 
+    transcription_factors = [col for col in S.columns if col[:7] != 'Unnamed']
     annotations = S[['Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3']]
     S = S[transcription_factors]
     return data_details_return({'annotations' : annotations, 'Y' : S, 'transcription_factors': transcription_factors}, data_set)
-    
+
 
 def fruitfly_tomancak(data_set='fruitfly_tomancak', gene_number=None):
@@ -425,7 +425,7 @@ def fruitfly_tomancak(data_set='fruitfly_tomancak', gene_number=None):
     xt = np.linspace(0, num_time-1, num_time)
     xr = np.linspace(0, num_repeats-1, num_repeats)
     xtime, xrepeat = np.meshgrid(xt, xr)
-    X = np.vstack((xtime.flatten(), xrepeat.flatten())).T 
+    X = np.vstack((xtime.flatten(), xrepeat.flatten())).T
     return data_details_return({'X': X, 'Y': Y, 'gene_number' : gene_number}, data_set)
 
 def drosophila_protein(data_set='drosophila_protein'):
@@ -467,7 +467,7 @@ def google_trends(query_terms=['big data', 'machine learning', 'data science'],
     """Data downloaded from Google trends for given query terms. Warning, if you use this function multiple times in a row you get blocked due to terms of service violations. The function will cache the result of your query, if you wish to refresh an old query set refresh_data to True. The function is inspired by this notebook: http://nbviewer.ipython.org/github/sahuguet/notebooks/blob/master/GoogleTrends%20meet%20Notebook.ipynb"""
     query_terms.sort()
     import pandas
-    
+
     # Create directory name for data
     dir_path = os.path.join(data_path,'google_trends')
     if not os.path.isdir(dir_path):
@@ -514,9 +514,9 @@ def google_trends(query_terms=['big data', 'machine learning', 'data science'],
     X = np.asarray([(row, i) for i in range(terms) for row in df.index])
     Y = np.asarray([[df.ix[row][query_terms[i]]] for i in range(terms) for row in df.index ])
     output_info = columns[1:]
-    
+
     return data_details_return({'data frame' : df, 'X': X, 'Y': Y, 'query_terms': output_info, 'info': "Data downloaded from google trends with query terms: " + ', '.join(output_info) + '.'}, data_set)
-    
+
 # The data sets
 def oil(data_set='three_phase_oil_flow'):
     """The three phase oil data from Bishop and James (1993)."""
@@ -647,7 +647,7 @@ def decampos_digits(data_set='decampos_characters', which_digits=[0,1,2,3,4,5,6,
     lbls = np.array([[l]*num_samples for l in which_digits]).reshape(Y.shape[0], 1)
     str_lbls = np.array([[str(l)]*num_samples for l in which_digits])
     return data_details_return({'Y': Y, 'lbls': lbls, 'str_lbls' : str_lbls, 'info': 'Digits data set from the de Campos characters data'}, data_set)
-    
+
 def ripley_synth(data_set='ripley_prnn_data'):
     if not data_available(data_set):
         download_data(data_set)
@@ -674,7 +674,7 @@ def mauna_loa(data_set='mauna_loa', num_train=545, refresh_data=False):
     Y = allY[:num_train, 0:1]
     Ytest = allY[num_train:, 0:1]
     return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'info': "Mauna Loa data with " + str(num_train) + " values used as training points."}, data_set)
-    
+
 def boxjenkins_airline(data_set='boxjenkins_airline', num_train=96):
     path = os.path.join(data_path, data_set)
@@ -686,7 +686,7 @@ def boxjenkins_airline(data_set='boxjenkins_airline', num_train=96):
     Xtest = data[num_train:, 0:1]
     Ytest = data[num_train:, 1:2]
-    return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'info': "Montly airline passenger data from Box & Jenkins 1976."}, data_set)
-    
+    return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'info': "Monthly airline passenger data from Box & Jenkins 1976."}, data_set)
+
 def osu_run1(data_set='osu_run1', sample_every=4):
     path = os.path.join(data_path, data_set)
@@ -725,7 +725,7 @@ def hapmap3(data_set='hapmap3'):
     \ -1, iff SNPij==(B2,B2)
 
     The SNP data and the meta information (such as iid, sex and phenotype) are
-    stored in the dataframe datadf, index is the Individual ID, 
+    stored in the dataframe datadf, index is the Individual ID,
     with following columns for metainfo:
 
         * family_id -> Family ID
@@ -798,7 +798,7 @@ def hapmap3(data_set='hapmap3'):
             status=write_status('unpacking...', curr, status)
         os.remove(filepath)
     status=write_status('reading .ped...', 25, status)
-    # Preprocess data: 
+    # Preprocess data:
     snpstrnp = np.loadtxt(unpacked_files[0], dtype=str)
     status=write_status('reading .map...', 33, status)
     mapnp = np.loadtxt(unpacked_files[1], dtype=str)
@@ -959,7 +959,7 @@ def olivetti_glasses(data_set='olivetti_glasses', num_training=200, seed=default
     Y = y[index[:num_training],:]
     Ytest = y[index[num_training:]]
-    return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'seed' : seed, 'info': "ORL Faces with labels identifiying who is wearing glasses and who isn't. Data is randomly partitioned according to given seed. Presence or absence of glasses was labelled by James Hensman."}, 'olivetti_faces')
-    
+    return data_details_return({'X': X, 'Y': Y, 'Xtest': Xtest, 'Ytest': Ytest, 'seed' : seed, 'info': "ORL Faces with labels identifying who is wearing glasses and who isn't. Data is randomly partitioned according to given seed. Presence or absence of glasses was labelled by James Hensman."}, 'olivetti_faces')
+
 def olivetti_faces(data_set='olivetti_faces'):
     path = os.path.join(data_path, data_set)
     if not data_available(data_set):
@@ -972,7 +972,8 @@ def olivetti_faces(data_set='olivetti_faces'):
     for subject in range(40):
         for image in range(10):
             image_path = os.path.join(path, 'orl_faces', 's'+str(subject+1), str(image+1) + '.pgm')
-            Y.append(GPy.util.netpbmfile.imread(image_path).flatten())
+            from GPy.util import netpbmfile
+            Y.append(netpbmfile.imread(image_path).flatten())
             lbls.append(subject)
     Y = np.asarray(Y)
     lbls = np.asarray(lbls)[:, None]
@@ -1195,7 +1196,7 @@ def cifar10_patches(data_set='cifar-10'):
     for x in range(0,32-5,5):
         for y in range(0,32-5,5):
             patches = np.concatenate((patches, images[:,x:x+5,y:y+5,:]), axis=0)
-    patches = patches.reshape((patches.shape[0],-1)) 
+    patches = patches.reshape((patches.shape[0],-1))
     return data_details_return({'Y': patches, "info" : "32x32 pixel patches extracted from the CIFAR-10 data by Boris Babenko to demonstrate k-means features."}, data_set)
 
 def cmu_mocap_49_balance(data_set='cmu_mocap'):
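One remark on the olivetti_faces hunk above: attribute access such as GPy.util.netpbmfile only works if the netpbmfile submodule has already been imported somewhere in the process, whereas an explicit from ... import statement always loads it on demand, which is presumably the motivation for the change. A minimal sketch of the distinction (the image path is hypothetical):

    import GPy

    # Fragile: raises AttributeError unless an earlier import already bound
    # the submodule as an attribute of the GPy.util package:
    #     GPy.util.netpbmfile.imread(image_path)

    # Robust: the import statement loads the submodule when first needed.
    from GPy.util import netpbmfile
    image = netpbmfile.imread('orl_faces/s1/1.pgm')  # hypothetical path
    print(image.shape)  # ORL faces are 112x92 grayscale images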