[datasets] Deng et al., labels revisited

2026-06-11 15:15:15 +02:00 · 2014-10-10 11:07:07 +01:00 · 2014-10-10 11:07:07 +01:00 · 9b3498a912
commit 9b3498a912
parent c128c6f948
1 changed files with 17 additions and 3 deletions
--- a/GPy/util/datasets.py
+++ b/GPy/util/datasets.py
@ -964,7 +964,7 @@ def singlecell_rna_seq_deng(dataset='singlecell_deng'):
    if not data_available(dataset):
        download_data(dataset)

-    from pandas import read_csv
+    from pandas import read_csv, isnull
    dir_path = os.path.join(data_path, dataset)

    # read the info .soft
@ -983,6 +983,21 @@ def singlecell_rna_seq_deng(dataset='singlecell_deng'):
    c[1:4] = ['strain', 'cross', 'developmental_stage']
    sample_info.columns = c

+    # get the labels right:
+    rep = re.compile('\(.*\)')
+    def filter_dev_stage(row):
+        if isnull(row):
+            row = "2-cell stage embryo"
+        if row.startswith("developmental stage: "):
+            row = row[len("developmental stage: "):]
+        if row == 'adult':
+            row += " liver"
+        row = row.replace(' stage ', ' ')
+        row = rep.sub(' ', row)
+        row = row.strip(' ')
+        return row
+    labels = sample_info.developmental_stage.apply(filter_dev_stage)
+
    # Extract the tar file
    filename = os.path.join(dir_path, 'GSE45719_Raw.tar')
    with tarfile.open(filename, 'r') as files:
@ -1016,8 +1031,7 @@ def singlecell_rna_seq_deng(dataset='singlecell_deng'):
    sample_info.index = data.index

    # get the labels from the description
-    rep = re.compile('fibroblast|\d+-cell|embryo|liver|blastocyst|blastomere|zygote', re.IGNORECASE)
-    labels = sample_info.developmental_stage.apply(lambda row: " ".join(rep.findall(row)))
+    #rep = re.compile('fibroblast|\d+-cell|embryo|liver|early blastocyst|mid blastocyst|late blastocyst|blastomere|zygote', re.IGNORECASE)

    sys.stdout.write(' '*len(message) + '\r')
    sys.stdout.flush()