From 9b3498a9129044898414987e286a1bf449fbef7f Mon Sep 17 00:00:00 2001 From: Max Zwiessele Date: Fri, 10 Oct 2014 11:07:07 +0100 Subject: [PATCH] [datasets] deng et all, labels revisited --- GPy/util/datasets.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/GPy/util/datasets.py b/GPy/util/datasets.py index c14ce66f..254639a6 100644 --- a/GPy/util/datasets.py +++ b/GPy/util/datasets.py @@ -964,7 +964,7 @@ def singlecell_rna_seq_deng(dataset='singlecell_deng'): if not data_available(dataset): download_data(dataset) - from pandas import read_csv + from pandas import read_csv, isnull dir_path = os.path.join(data_path, dataset) # read the info .soft @@ -983,6 +983,21 @@ def singlecell_rna_seq_deng(dataset='singlecell_deng'): c[1:4] = ['strain', 'cross', 'developmental_stage'] sample_info.columns = c + # get the labels right: + rep = re.compile('\(.*\)') + def filter_dev_stage(row): + if isnull(row): + row = "2-cell stage embryo" + if row.startswith("developmental stage: "): + row = row[len("developmental stage: "):] + if row == 'adult': + row += " liver" + row = row.replace(' stage ', ' ') + row = rep.sub(' ', row) + row = row.strip(' ') + return row + labels = sample_info.developmental_stage.apply(filter_dev_stage) + # Extract the tar file filename = os.path.join(dir_path, 'GSE45719_Raw.tar') with tarfile.open(filename, 'r') as files: @@ -1016,8 +1031,7 @@ def singlecell_rna_seq_deng(dataset='singlecell_deng'): sample_info.index = data.index # get the labels from the description - rep = re.compile('fibroblast|\d+-cell|embryo|liver|blastocyst|blastomere|zygote', re.IGNORECASE) - labels = sample_info.developmental_stage.apply(lambda row: " ".join(rep.findall(row))) + #rep = re.compile('fibroblast|\d+-cell|embryo|liver|early blastocyst|mid blastocyst|late blastocyst|blastomere|zygote', re.IGNORECASE) sys.stdout.write(' '*len(message) + '\r') sys.stdout.flush()